/*
 * systrace.c
 *
 * Copyright (c) 2002 Marius Aamodt Eriksen <marius@umich.edu>
 * Copyright (c) 2002 Niels Provos <provos@citi.umich.edu>
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The names of the copyright holders may not be used to endorse or
 *     promote products derived from this software without specific
 *     prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
 *  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 *  AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
 *  THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 *  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 *  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * XXX clone()'s with same PID
 */

#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/sys.h>
#include <linux/miscdevice.h>
#include <linux/queue.h>

#include <asm/semaphore.h>
#include <asm/uaccess.h>
#include <asm/ptrace.h>

#include <linux/queue.h>	
#include <linux/systrace.h>
#include <linux/poll.h>

#include "systrace-private.h"

/*
 * FIXARGS: copy system call arguments from the register set `regs'
 * into the array `args'.
 *
 * `argsize' is the total argument size in bytes; only 0, 4, 8, 12, 16
 * and 20 are accepted.  The cases fall through on purpose: entering at
 * a given size copies that argument and every lower-numbered one
 * (edi -> args[4], esi -> args[3], edx -> args[2], ecx -> args[1],
 * ebx -> args[0]).  Any other size is logged and hits BUG().
 */
#define FIXARGS(argsize, args, regs) do {   \
	switch (argsize) {                  \
	case 20:                            \
		args[4] = regs->edi;        \
	case 16:                            \
		args[3] = regs->esi;        \
	case 12:                            \
		args[2] = regs->edx;        \
	case 8:                             \
		args[1] = regs->ecx;        \
	case 4:                             \
		args[0] = regs->ebx;        \
	case 0:                             \
		break;                      \
	default:                            \
		printk(KERN_ERR "systrace: (FIXARGS) Illegal argument size %d\n", argsize);\
		BUG();                      \
	}                                   \
} while (0)

/*
 * SAVEARGS: the inverse of FIXARGS; write the values in `args' back
 * into the register set `regs'.
 *
 * `argsize' is the total argument size in bytes (0, 4, ..., 20).  The
 * cases fall through on purpose so that every argument up to the given
 * size is stored.  Any other size is logged and hits BUG().
 */
#define SAVEARGS(argsize, args, regs) do {  \
	switch (argsize) {                  \
	case 20:                            \
		regs->edi = args[4];        \
	case 16:                            \
		regs->esi = args[3];        \
	case 12:                            \
		regs->edx = args[2];        \
	case 8:                             \
		regs->ecx = args[1];        \
	case 4:                             \
		regs->ebx = args[0];        \
	case 0:                             \
		break;                      \
	default:                            \
		printk(KERN_ERR "systrace: Illegal argument size %d\n", argsize);\
		BUG();                      \
	}                                   \
} while (0)

/*
 * PRINTARGS: debugging aid that prints the argument registers of
 * `regs' with printk(), one per line.
 *
 * `argsize' is the total argument size in bytes (0, 4, ..., 20); the
 * cases fall through on purpose so that every register up to the given
 * size is printed, highest first.  Any other size is logged and hits
 * BUG().
 */
#define PRINTARGS(argsize, regs) do {                \
	switch (argsize) {                           \
	case 20:                                     \
		printk("    edi: %lx\n", regs->edi); \
	case 16:                                     \
		printk("    esi: %lx\n", regs->esi); \
	case 12:                                     \
		printk("    edx: %lx\n", regs->edx); \
	case 8:                                      \
		printk("    ecx: %lx\n", regs->ecx); \
	case 4:                                      \
		printk("    ebx: %lx\n", regs->ebx); \
	case 0:                                      \
		break;                               \
	default:                                     \
		printk(KERN_ERR "systrace: Illegal argument size %d\n", argsize);\
		BUG();                               \
	}                                            \
} while (0)

/* Minor number used when registering the systrace misc device. */
#define SYSTRACE_MINOR 223

/* Spinlock acquired by _systrace_lock() and dropped by _systrace_unlock(). */
spinlock_t str_lck = SPIN_LOCK_UNLOCKED;
/*
 * Debug switch, off by default.
 * NOTE(review): presumably consulted by DPRINTF() -- confirm in the
 * private header.
 */
int systrace_debug = 0;

/*
 * Forward declarations of file-local helpers taken from kernel/sys.c.
 * cap_emulate_setxuid() is defined at the end of this file.
 * NOTE(review): no definition of __getcwd() is visible in this part of
 * the file -- confirm it exists or drop the declaration.
 */
static inline void cap_emulate_setxuid(int, int, int);
static long        __getcwd(char *, unsigned long);

/*
 * Entry points for the system call path.  Both are declared FASTCALL,
 * i.e. the pt_regs pointer is passed in a register: we need the
 * register frame that the system call itself will see, in order to
 * examine it and possibly modify it.
 */

int  FASTCALL(systrace_intercept(struct pt_regs *));
void FASTCALL(systrace_result(struct pt_regs *));

/*
 * File operations of the systrace device.  Each open file carries its
 * own struct fsystrace in file->private_data (set up in
 * systracef_open(), torn down in systracef_release()).
 */
static struct file_operations systrace_fops = {
	read:    &systracef_read,
	write:   &systracef_write,
	ioctl:   &systracef_ioctl,
	release: &systracef_release,
	open:    &systracef_open,
	poll:    &systracef_poll
};

/*
 * Misc device descriptor handed to misc_register() by init_systrace():
 * minor number, device name and file operations, in that order.
 */
static struct miscdevice systrace_dev = {
	SYSTRACE_MINOR,
	"systrace",
	&systrace_fops
};

/*
 * Acquire the global systrace spinlock (str_lck).
 * NOTE(review): the rest of the file calls systrace_lock(), presumably a
 * wrapper around this function declared in systrace-private.h -- confirm.
 */
void
_systrace_lock(void)
{
	spin_lock(&str_lck);
}

/*
 * Release the global systrace spinlock (str_lck).
 * NOTE(review): the rest of the file calls systrace_unlock(), presumably
 * a wrapper around this function declared in systrace-private.h -- confirm.
 */
void
_systrace_unlock(void)
{
	spin_unlock(&str_lck);
}

/*
 * Register the systrace misc device.
 *
 * Returns 0 on success, or -EIO when misc_register() reports a failure.
 */
int
init_systrace(void)
{
	int rc;

	rc = misc_register(&systrace_dev);
	if (rc >= 0) {
		printk(KERN_INFO "systrace: systrace initialized\n");
		return (0);
	}

	printk(KERN_INFO "systrace: unable to register device\n");
	return (-EIO);
}

int
systracef_open(struct inode *inode, struct file *file)
{
	struct fsystrace *fst;
	int error = 0;

	if ((fst = kmalloc(sizeof(*fst), GFP_KERNEL)) == NULL) {
		printk(KERN_ERR "systrace: Failed to allocate kernel memory.\n");
		error = 0;
		goto out;
	}

	memset(fst, 0, sizeof(*fst));

	TAILQ_INIT(&fst->processes);
	TAILQ_INIT(&fst->policies);
	TAILQ_INIT(&fst->messages);

	init_MUTEX(&fst->lock);
	init_waitqueue_head(&fst->wqh);

	fst->euid = current->euid;
	fst->egid = current->egid;
	fst->issuser = suser();
	fst->pid = current->pid;

	file->private_data = fst;

 out:
	return (error);
}

int
systracef_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
    unsigned long arg)
{
	struct fsystrace *fst = (struct fsystrace *)file->private_data;
	pid_t pid = 0;
	struct str_process *strp = NULL;
	int error = 0;
	void *data = NULL;

	if (fst == NULL) {
		printk(KERN_ERR "systrace: in impossible state!\n");
		BUG();
	}

	/* Argument santizing */
	switch (cmd) {
	case STRIOCATTACH:
	case STRIOCANSWER:
	case STRIOCIO:
	case STRIOCGETCWD:
	case STRIOCDETACH:
	case STRIOCPOLICY:
	case STRIOCREPLACE:
		if ((void *)arg == NULL)
			error = -EINVAL;
		break;
	case STRIOCRESCWD:
	default:
		break;
	}

	if (error != 0)
		goto out;

	switch (cmd) {
	case STRIOCANSWER:
		if ((data = kmalloc(sizeof(struct systrace_answer),
			 GFP_KERNEL)) == NULL) {
			error = -ENOSPC;
			break;
		}
		if (copy_from_user((struct systrace_answer *)data,
			(struct systrace_answer *)arg,
			sizeof(struct systrace_answer)) != 0) {
			kfree(data);
			error = -EFAULT;
			break;
		}

		pid = ((struct systrace_answer *)data)->stra_pid;
		break;
	case STRIOCIO:
		if ((data = kmalloc(sizeof(struct systrace_io),
			 GFP_KERNEL)) == NULL) { 
			error = -ENOSPC;
			break;
		}
		if (copy_from_user((struct systrace_io *)data,
			(struct systrace_io *)arg,
			sizeof(struct systrace_io)) != 0) {
			kfree(data);
			error = -EFAULT;
			break;
		}

		pid = ((struct systrace_io *)data)->strio_pid;
		break;
	case STRIOCGETCWD:
	case STRIOCDETACH:
		if (get_user(pid, (pid_t *)arg) != 0)
			error = -EFAULT;

		if (pid == 0)
			error = -EINVAL;
		break;
	case STRIOCATTACH:
	case STRIOCRESCWD:
		break;
	case STRIOCPOLICY:
		if ((data = kmalloc(sizeof(struct systrace_policy),
			 GFP_KERNEL)) == NULL) {
			error = -ENOSPC;
			break;
		}
		if (copy_from_user((struct systrace_policy *)data,
			(struct systrace_policy *)arg,
			sizeof(struct systrace_policy)) != 0) {
			kfree(data);
			error = -EFAULT;
			break;
		}
		break;
	case STRIOCREPLACE:
		if ((data = kmalloc(sizeof(struct systrace_replace),
			 GFP_KERNEL)) == NULL) {
			error = -ENOSPC;
			break;
		}
		if (copy_from_user((struct systrace_replace *)data,
			(struct systrace_replace *)arg,
			sizeof(struct systrace_replace)) != 0) {
			kfree(data);
			error = -EFAULT;
			break;
		}

		pid = ((struct systrace_replace *)data)->strr_pid;
		break;
	default:
		error = -EINVAL;
	}

	if (error != 0)
		goto out;

	systrace_lock();
	down(&fst->lock);
	systrace_unlock();

	if (pid != 0)
		if ((strp = systrace_findpid(fst, pid)) == NULL) {
			error = -EINVAL;
			goto unlock;
		}

	switch (cmd) {
	case STRIOCATTACH:
		if (get_user(pid, (pid_t *)arg) != 0)
			error = -EFAULT;

		if (pid == 0)
			error = -EINVAL;
		else
			error = systrace_attach(fst, *(pid_t *)arg);
		break;
	case STRIOCDETACH:
		error = systrace_detach(strp);
		break;
	case STRIOCANSWER:
		error = systrace_answer(strp, (struct systrace_answer *)data);
		break;
	case STRIOCIO:
		error = systrace_io(strp, (struct systrace_io *)data);
		break;
	case STRIOCGETCWD:
		error = systrace_getcwd(fst, strp);
		break;
	case STRIOCRESCWD:
		error = systrace_rescwd(fst);
		break;
	case STRIOCPOLICY:
		error = systrace_policy(fst, (struct systrace_policy *)data);
		if (copy_to_user((struct systrace_policy *)arg,
			(struct systrace_policy *)data,
			sizeof(struct systrace_policy)) != 0)
			error = -EFAULT;
		break;
	case STRIOCREPLACE:
		error = systrace_preprepl(strp, (struct systrace_replace *)data);
		break;
	default:
		/* XXX */
		break;
	}

	if (data != NULL)
		kfree(data);

 unlock:
	up(&fst->lock);
 out:
	return (error);
}

/*
 * poll() handler: register on the file's wait queue and report the
 * device as readable (POLLIN | POLLRDNORM) whenever at least one
 * message is queued.  The message queue is inspected with fst->lock
 * held.
 */
unsigned int
systracef_poll(struct file *file, struct poll_table_struct *wait)
{
	struct fsystrace *fst = (struct fsystrace *)file->private_data;
	unsigned int mask;

	systrace_lock();
	down(&fst->lock);
	systrace_unlock();

	poll_wait(file, &fst->wqh, wait);

	mask = (TAILQ_FIRST(&fst->messages) == NULL) ?
	    0 : (POLLIN | POLLRDNORM);

	up(&fst->lock);

	return (mask);
}

/*
 * read() handler: hand the oldest queued message to the monitor.
 *
 * `count' must be exactly sizeof(struct str_message), otherwise -EINVAL
 * is returned.  On success the message is copied to `buf', removed from
 * the queue and its size is returned.  With no message queued:
 *   - 0 (EOF) is returned when no process is being traced;
 *   - -EAGAIN is returned for O_NONBLOCK files;
 *   - otherwise the caller sleeps on fst->wqh and retries, returning
 *     -ERESTARTSYS if a signal is pending after waking up.
 * -EFAULT is returned if the copy to user space fails; the message then
 * stays on the queue.
 */
ssize_t
systracef_read(struct file *filp, char *buf, size_t count, loff_t *off)
{
	struct fsystrace *fst = (struct fsystrace *)filp->private_data;
	struct str_process *strp;
	int error = 0;

	if (count != sizeof(struct str_message))
		return (-EINVAL);

 again:
	systrace_lock();
	down(&fst->lock);
	systrace_unlock();

	if ((strp = TAILQ_FIRST(&fst->messages)) != NULL) {
		error = copy_to_user(buf, &strp->msg, sizeof(struct str_message));
		if (error != 0) {
			error = -EFAULT;
		} else {
			/* `error' doubles as the byte count on success */
			error = sizeof(struct str_message);
			TAILQ_REMOVE(&fst->messages, strp, msg_next);
			CLR(strp->flags, STR_PROC_ONQUEUE);

			/*
			 * Entries for which SYSTR_MSG_NOPROCESS() is true
			 * are freed once they have been delivered.
			 */
			if (SYSTR_MSG_NOPROCESS(strp))
				kfree(strp);
		}
	} else if (TAILQ_FIRST(&fst->processes) == NULL) {
		/* EOF situation */
		;
	} else {
		if (filp->f_flags & O_NONBLOCK) {
			error = -EAGAIN;
		} else {
			/* Drop the lock before sleeping; it is retaken at `again' */
			up(&fst->lock);
			interruptible_sleep_on(&fst->wqh);

			if (signal_pending(current)) {
				error = -ERESTARTSYS;
				/* fst->lock is not held here, so skip the up() */
				goto out;
			}
			goto again;
		}
	}

	up(&fst->lock);
 out:
	return (error);
}

/*
 * write() handler: writing to the systrace device is not supported;
 * always returns -ENOTSUPP.
 */
ssize_t
systracef_write(struct file *filp, const char *buf, size_t count, loff_t *off)
{
	return (-ENOTSUPP);
}

/*
 * release() handler: tear down all state attached to this open file.
 *
 * With fst->lock held, every traced process is detached and sent
 * SIGKILL, the remaining queued messages are freed and all policies
 * are closed.  Finally the struct fsystrace itself is freed and
 * filp->private_data is cleared.  Always returns 0.
 */
int
systracef_release(struct inode *inode, struct file *filp)
{
	struct fsystrace *fst = filp->private_data;
	struct str_process *strp;
	struct str_policy *strpol;
	struct task_struct *victim;

	systrace_lock();
	down(&fst->lock);
	systrace_unlock();

	/* Kill all traced processes */
	for (;;) {
		strp = TAILQ_FIRST(&fst->processes);
		if (strp == NULL)
			break;

		/* systrace_detach() frees strp, so fetch the task first */
		victim = strp->proc;
		systrace_detach(strp);
		kill_proc(victim->pid, SIGKILL, 1);
	}

	/* Clean up fork and exit messages */
	for (;;) {
		strp = TAILQ_FIRST(&fst->messages);
		if (strp == NULL)
			break;

		TAILQ_REMOVE(&fst->messages, strp, msg_next);
		kfree(strp);
	}

	/* Clean up policies */
	for (;;) {
		strpol = TAILQ_FIRST(&fst->policies);
		if (strpol == NULL)
			break;

		systrace_closepolicy(fst, strpol);
	}

	up(&fst->lock);

	kfree(fst);
	filp->private_data = NULL;

	return (0);
}

/*
 * Fork hook: if `parent' is being traced, start tracing `child' through
 * the same monitor.
 *
 * The child is inserted into the monitor's process list, inherits the
 * parent's policy (taking a reference on it) and a child message
 * carrying the new pid is queued for the monitor.  Nothing happens when
 * the parent is not traced.
 */
void
systrace_fork(struct task_struct *parent, struct task_struct *child)
{
	struct str_process *pstrp, *cstrp;
	struct fsystrace *fst;

	systrace_lock();
	pstrp = parent->systrace;
	if (pstrp == NULL) {
		systrace_unlock();
		return;
	}

	fst = pstrp->parent;
	down(&fst->lock);
	systrace_unlock();

	if (systrace_insert_process(fst, child) == 0) {
		/* XXX make sure we have pid by this time in fork() */
		cstrp = systrace_findpid(fst, child->pid);
		if (cstrp == NULL) {
			printk(KERN_ERR "systrace: inconsistency in tracked process!\n");
			BUG();
		}

		cstrp->policy = pstrp->policy;
		if (cstrp->policy != NULL)
			cstrp->policy->refcount++;

		/* Fork message */
		systrace_msg_child(fst, pstrp, child->pid);
	} else {
		printk(KERN_ERR "systrace: failed inserting process!\n");
	}

	up(&fst->lock);
}

/*
 * Exit hook: if task `p' is being traced, queue a child message with
 * new_pid -1 for its monitor and detach the task.  Does nothing for
 * untraced tasks.
 */
void
systrace_exit(struct task_struct *p)
{
	struct str_process *strp;
	struct fsystrace *fst;

	systrace_lock();
	strp = p->systrace;
	if (strp == NULL) {
		systrace_unlock();
		return;
	}

	fst = strp->parent;
	down(&fst->lock);
	systrace_unlock();

	/* Notify our monitor of our death */
	systrace_msg_child(fst, strp, -1);

	systrace_detach(strp);
	up(&fst->lock);
}

/*
 * Syscall-exit hook for the current task.
 *
 * When the task is traced this
 *   - restores the effective uid/gid that were changed for the system
 *     call (monitor opened by the superuser only);
 *   - sends a uid/gid message if the effective ids differ from the
 *     ones recorded at syscall entry;
 *   - sends a result message if the monitor asked for one
 *     (STR_PROC_SYSCALLRES);
 *   - frees any pending argument replacement and restores the address
 *     limit changed by systrace_intercept() (STR_PROC_FSCHANGE).
 *
 * The systrace_msg_*() calls release fst->lock and sleep, so the task
 * may have been detached meanwhile; current->systrace is therefore
 * re-read after each of them.
 *
 * Fix: the effective-gid restore used to be nested inside the
 * STR_PROC_SETEUID test and was skipped when only STR_PROC_SETEGID was
 * set; the two flags are now handled independently.
 */
void
systrace_result(struct pt_regs *regs)
{
	struct str_process *strp;
	struct fsystrace *fst;
	int error, argsize, narg, code;
	extern struct sysent linux_sysent[];

	systrace_lock();

	if ((strp = current->systrace) == NULL)
		goto out;

	code = strp->code;
	narg = linux_sysent[code].sy_narg;
	argsize = sizeof(register_t) * narg;

	fst = strp->parent;

	/* Restore elevated privileges if appropriate */
	if (strp->issuser) {
		/*
		 * NOTE(review): a flag stays set when the system call itself
		 * changed the id; confirm that this is intended.
		 */
		if (ISSET(strp->flags, STR_PROC_SETEUID) &&
		    current->euid == strp->seteuid) {
			systrace_seteuid(current, strp->savedeuid);
			CLR(strp->flags, STR_PROC_SETEUID);
		}
		if (ISSET(strp->flags, STR_PROC_SETEGID) &&
		    current->egid == strp->setegid) {
			systrace_setegid(current, strp->savedegid);
			CLR(strp->flags, STR_PROC_SETEGID);
		}
	}

	/* Change in UID/GID */
	if (strp->oldegid != current->egid || strp->oldeuid != current->euid) {
		down(&fst->lock);
		systrace_unlock();

		/* Releases fst->lock and waits for the monitor */
		systrace_msg_ugid(fst, strp);
		systrace_lock();
		if ((strp = current->systrace) == NULL)
			goto out;
	}

	if (ISSET(strp->flags, STR_PROC_SYSCALLRES)) {
		CLR(strp->flags, STR_PROC_SYSCALLRES);

		down(&fst->lock);
		systrace_unlock();

		error = regs->eax;

		/* Releases fst->lock and waits for the monitor */
		systrace_msg_result(fst, strp, error, code, argsize, strp->args);
		systrace_lock();
		if ((strp = current->systrace) == NULL)
			goto out;
	}

	if (strp->replace != NULL) {
		kfree(strp->replace);
		strp->replace = NULL;
	}

	if (ISSET(strp->flags, STR_PROC_FSCHANGE))
		set_fs(strp->oldfs);

 out:
	systrace_unlock();
}

/*
 * XXX serialize system calls
 */
/*
 * Syscall-entry hook for the current task.
 *
 * For a traced task the system call number and arguments are taken
 * from `regs', recorded in the task's str_process and checked against
 * the task's policy:
 *   - SYSTR_POLICY_PERMIT: the call goes ahead;
 *   - SYSTR_POLICY_ASK:    the monitor is asked and the task sleeps
 *                          until it answers; the monitor may deny the
 *                          call or replace arguments;
 *   - SYSTR_POLICY_NEVER:  the call is denied with -EPERM;
 *   - anything else:       a negative policy value is used as the
 *                          error, otherwise -EPERM.
 * A task the monitor may not control is always permitted.
 *
 * Returns 0 when the call may proceed, otherwise a negative errno.
 * Untraced tasks get 0; a NULL `regs' yields -EPERM.
 *
 * Fixes:
 *   - `error = systrace_replace(...) == 0' stored the result of the
 *     comparison (1 on success) in `error'; the assignment is now
 *     parenthesised so `error' holds systrace_replace()'s return value;
 *   - args[] is zeroed before use so that no uninitialised stack
 *     content is copied into strp->args.
 */
int
systrace_intercept(struct pt_regs *regs)
{
	register_t args[8];
	int argsize, narg, code, error = 0, maycontrol = 0, issuser = 0;
	short policy;
	struct str_process *strp;
	struct fsystrace *fst = NULL;
	extern struct sysent linux_sysent[];
	struct str_policy *strpolicy;

	systrace_lock();

	if ((strp = current->systrace) == NULL) {
		systrace_unlock();
		goto out;
	}

	fst = strp->parent;

	down(&fst->lock);
	systrace_unlock();

	CLR(strp->flags, STR_PROC_FSCHANGE);

	if (regs != NULL) {
		code = regs->orig_eax;
	} else {
		error = -EPERM;
		goto out;
	}

	/*
	 * NOTE(review): this accepts code == NR_syscalls and negative
	 * values; confirm the size of linux_sysent[] and whether the
	 * caller has already range-checked the number.
	 */
	if (code > NR_syscalls) {
		printk(KERN_ERR "systrace: in impossible state!\n");
		BUG();
	}

	narg = linux_sysent[code].sy_narg;
	argsize = sizeof(register_t) * narg;

	/*
	 * Linux passes system call arguments in registers.  We want
	 * to be able to pass back an args array; convert
	 * appropriately.
	 */

	memset(args, 0, sizeof(args));
	FIXARGS(argsize, args, regs);

	if (strp->proc != current) {
		printk(KERN_ERR "systrace: inconsistency in process states!\n");
		BUG();
	}

	/*
	 * A superuser monitor may control everything; anybody else only
	 * unprivileged, dumpable tasks running with the monitor's ids.
	 */
	if (fst->issuser) {
		maycontrol = 1;
		issuser = 1;
	} else if (cap_isclear(current->cap_effective) &&
	    !(current->flags & PF_SUPERPRIV) &&
	    current->mm->dumpable) {
		maycontrol = fst->euid == current->euid &&
		    fst->egid == current->egid;
	}

	strp->code = code;
	strp->maycontrol = maycontrol;
	memcpy(strp->args, args, sizeof(strp->args));
	strp->oldeuid = current->euid;
	strp->oldegid = current->egid;
	strp->issuser = fst->issuser;

	if (!maycontrol) {
		policy = SYSTR_POLICY_PERMIT;
	} else {
		/* Find out current policy */
		if ((strpolicy = strp->policy) == NULL) {
			policy = SYSTR_POLICY_ASK;
		} else {
			if (code >= strpolicy->nsysent)
				policy = SYSTR_POLICY_NEVER;
			else
				policy = strpolicy->sysent[code];
		}
	}

	switch (policy) {
	case SYSTR_POLICY_PERMIT:
		break;
	case SYSTR_POLICY_ASK:
		error = systrace_msg_ask(fst, strp, code, argsize, args);
		/* systrace_msg_ask releases lock */
		fst = NULL;
		/* We might have detached by now for some reason */
		if (error == 0 && (strp = current->systrace) != NULL) {
			/* XXX - do I need to lock here? */
			if (strp->answer == SYSTR_POLICY_NEVER) {
				error = strp->error;
				if (strp->replace != NULL) {
					kfree(strp->replace);
					strp->replace = NULL;
				}
			} else if (strp->replace != NULL) {
				if ((error = systrace_replace(strp,
					 argsize, args)) == 0) {
					/*
					 * Replaced arguments point into kernel
					 * memory: widen the address limit until
					 * systrace_result() restores it.
					 */
					SAVEARGS(argsize, args, regs);
					strp->oldfs = get_fs();
					set_fs(get_ds());
					SET(strp->flags, STR_PROC_FSCHANGE);
				}
			}
		}
		break;
	case SYSTR_POLICY_NEVER:
		error = -EPERM;
		break;
	default:
		if (policy < 0)
			error = policy;
		else
			error = -EPERM;
		break;
	}

	/* XXX */
/*
	if (error != 0)
		goto out;
*/
	/* Switch to the ids requested by the monitor for this call */
	systrace_lock();
	if ((strp = current->systrace) != NULL) {
		if (issuser) {
			if (ISSET(strp->flags, STR_PROC_SETEUID))
				strp->savedeuid = systrace_seteuid(current, strp->seteuid);
			if (ISSET(strp->flags, STR_PROC_SETEGID))
				strp->savedegid = systrace_setegid(current, strp->setegid);
		} else {
			CLR(strp->flags, STR_PROC_SETEUID | STR_PROC_SETEGID);
		}
	}
	systrace_unlock();

 out:
	/* fst is NULL whenever the lock is no longer (or was never) held */
	if (fst != NULL)
		up(&fst->lock);

	return (error);
}

/*
 * Prepare an argument replacement requested by the monitor
 * (STRIOCREPLACE).
 *
 * `repl' is the kernel copy of the request.  After validation a private
 * copy of the structure is allocated together with room for the
 * replacement data, the data is copied in from user space
 * (repl->strr_base) and strr_base of the private copy is pointed at it.
 * The result is stored in strp->replace, replacing any earlier one.
 *
 * Returns 0 on success, the error of systrace_processready() if the
 * process is not waiting for an answer, -EINVAL for an inconsistent
 * request, -ENOSPC if the allocation fails and -EFAULT if the user data
 * cannot be read.
 *
 * Fix: the bounds check `offlen + off > len' could wrap around and let
 * an out-of-range offset through.  Lengths are now bounded while they
 * are summed and the comparison is written without an addition.
 * Requests that were accepted before are still accepted.
 */
int
systrace_preprepl(struct str_process *strp, struct systrace_replace *repl)
{
	const size_t maxlen = 2048;	/* upper bound for the data length */
	size_t len, off, offlen;
	int i, error = 0;

	if ((error = systrace_processready(strp)) != 0)
		return (error);

	if (strp->replace != NULL) {
		kfree(strp->replace);
		strp->replace = NULL;
	}

	if (repl->strr_nrepl < 0 || repl->strr_nrepl > SYSTR_MAXARGS)
		return (-EINVAL);

	for (i = 0, len = 0; i < repl->strr_nrepl; i++) {
		offlen = repl->strr_offlen[i];
		off = repl->strr_off[i];

		/* A zero length means strr_off[i] is an immediate value */
		if (offlen == 0)
			continue;

		/* Keeping every piece and the sum small rules out overflow */
		if (offlen > maxlen)
			return (-EINVAL);
		len += offlen;
		if (len > maxlen)
			return (-EINVAL);

		/* off + offlen > len, without the addition (len >= offlen) */
		if (off > len - offlen)
			return (-EINVAL);
	}

	/* Make sure that the length adds up */
	if (repl->strr_len != len)
		return (-EINVAL);

	if ((strp->replace = kmalloc(sizeof(*strp->replace) + len, GFP_KERNEL))
	    == NULL)
		return (-ENOSPC);

	memcpy(strp->replace, repl, sizeof(*strp->replace));

	/* The replacement data lives directly behind the structure */
	if (copy_from_user(strp->replace + 1, repl->strr_base, len) != 0) {
		kfree(strp->replace);
		strp->replace = NULL;
		return (-EFAULT);
	}

	/* Adjust the offset */
	repl = strp->replace;
	repl->strr_base = (void *)(repl + 1);

	return (0);
}

/*
 * Apply the replacement stored in strp->replace to the argument array.
 *
 * `argsize' is the size of `args' in bytes.  For every replacement
 * entry the target index is checked against the number of arguments.
 * An entry with length zero stores its offset value directly as the
 * argument; any other entry stores the address of its data inside the
 * replacement buffer.
 *
 * Returns 0 on success.  On an out-of-range index the replacement is
 * freed, strp->replace is cleared and -EINVAL is returned.
 */
int
systrace_replace(struct str_process *strp, size_t argsize, register_t args[])
{
	struct systrace_replace *repl = strp->replace;
	void *kbase = repl->strr_base;
	int nargs = argsize / sizeof(register_t);
	int n = 0;
	int slot;

	while (n < nargs && n < repl->strr_nrepl) {
		slot = repl->strr_argind[n];
		if (slot < 0 || slot >= nargs) {
			kfree(repl);
			strp->replace = NULL;
			return (-EINVAL);
		}

		if (repl->strr_offlen[n] == 0) {
			/* Immediate value */
			args[slot] = repl->strr_off[n];
		} else {
			/* Replace the argument with the new address */
			args[slot] = (register_t)(kbase + repl->strr_off[n]);
		}
		n++;
	}

	return (0);
}

/*
 * Deliver the monitor's answer (STRIOCANSWER) to a waiting process.
 *
 * The answer is rejected with -EINVAL when its policy is not valid or
 * its sequence number does not match the process, and with the error
 * of systrace_processready() when the process is not waiting.
 * Otherwise policy, error code (-EPERM when the monitor passed 0) and
 * the requested flags are recorded and the process is woken up.
 *
 * Returns 0 on success.
 */
int
systrace_answer(struct str_process *strp, struct systrace_answer *ans)
{
	int rc;

	if (!POLICY_VALID(ans->stra_policy))
		return (-EINVAL);

	/* Check if answer is in sync with us */
	if (ans->stra_seqnr != strp->seqnr)
		return (-EINVAL);

	rc = systrace_processready(strp);
	if (rc != 0)
		return (rc);

	strp->answer = ans->stra_policy;
	strp->error = ans->stra_error;
	if (!strp->error)
		strp->error = -EPERM;

	if (ISSET(ans->stra_flags, SYSTR_FLAGS_RESULT))
		SET(strp->flags, STR_PROC_SYSCALLRES);

	/* See if we should elevate privileges for this system call */
	if (ISSET(ans->stra_flags, SYSTR_FLAGS_SETEUID)) {
		SET(strp->flags, STR_PROC_SETEUID);
		strp->seteuid = ans->stra_seteuid;
	}
	if (ISSET(ans->stra_flags, SYSTR_FLAGS_SETEGID)) {
		SET(strp->flags, STR_PROC_SETEGID);
		strp->setegid = ans->stra_setegid;
	}

	/* Clearing the flag indicates to the process that it woke up */
	CLR(strp->flags, STR_PROC_WAITANSWER);
	wake_up(&strp->wqh);

	return (0);
}

/*
 * Copy memory between the monitor and a traced process (STRIOCIO).
 *
 * io->strio_op selects the direction: SYSTR_READ copies io->strio_len
 * bytes from address io->strio_offs of the traced process to the
 * monitor's buffer io->strio_addr, SYSTR_WRITE copies the other way.
 * The data is staged in a temporary kernel buffer.
 *
 * Returns 0 on success, -EPERM when the monitor may not control the
 * process, -ENOMEM when the staging buffer cannot be allocated,
 * -EINVAL for an unknown operation and -EFAULT when either side of the
 * copy fails or is short.
 *
 * Fix: the staging buffer leaked when strio_op was invalid.
 */
int
systrace_io(struct str_process *strp, struct systrace_io *io)
{
	int rw, ret = 0, copied, maycontrol = 0;
	void *buf;
	struct fsystrace *fst = strp->parent;
	struct task_struct *tsk = strp->proc;

	/*
	 * A superuser monitor may touch any traced process; anybody else
	 * only unprivileged, dumpable processes running with the caller's
	 * effective ids.
	 */
	if (fst->issuser) {
		maycontrol = 1;
	} else if (cap_isclear(tsk->cap_effective) &&
	    !(tsk->flags & PF_SUPERPRIV) &&
	    tsk->mm->dumpable) {
		maycontrol = current->euid == tsk->euid &&
		    current->egid == tsk->egid;
	}

	if (!maycontrol)
		return (-EPERM);

	if ((buf = kmalloc(io->strio_len, GFP_KERNEL)) == NULL) {
		printk(KERN_ERR "systrace: failed to allocate kernel memory!\n");
		return (-ENOMEM);
	}

	switch (io->strio_op) {
	case SYSTR_READ:
		rw = 0;
		break;
	case SYSTR_WRITE:
		rw = 1;
		if (copy_from_user(buf, io->strio_addr, io->strio_len)) {
			ret = -EFAULT;
			goto out;
		}
		break;
	default:
		ret = -EINVAL;
		goto out;
	}

	copied = access_process_vm(tsk, (unsigned long)io->strio_offs, buf,
	    io->strio_len, rw);

	if (copied != io->strio_len) {
		ret = -EFAULT;
		goto out;
	}

	/* For a read, hand the staged data to the monitor */
	if (io->strio_op == SYSTR_READ &&
	    copy_to_user(io->strio_addr, buf, io->strio_len))
		ret = -EFAULT;

 out:
	kfree(buf);

	return (ret);
}

/*
 * STRIOCGETCWD: make the calling monitor temporarily use the working
 * and root directory of traced process `strp'.
 *
 * The monitor's own pwd/root (dentry and mount) are stashed in `fst'
 * and replaced by referenced copies (dget()/mntget()) of the traced
 * process's ones; fst->pwd_pid records for which process this was
 * done.  systrace_rescwd() undoes the switch.
 *
 * Returns 0 on success, the error of systrace_processready() when the
 * process is not waiting for an answer, or -EINVAL when either task has
 * no fs_struct.
 *
 * NOTE(review): a second call without an intervening systrace_rescwd()
 * overwrites the stashed pointers -- confirm that callers never do this.
 */
int
systrace_getcwd(struct fsystrace *fst, struct str_process *strp)
{
	struct fs_struct *fsc, *fsp;
	int error = 0;

	if ((error = systrace_processready(strp)) != 0)
		return (error);

	/* Both tasks are locked while their fs pointers are sampled and used */
	task_lock(current);
	task_lock(strp->proc);
	fsc = current->fs;
	fsp = strp->proc->fs;

	if (fsc == NULL || fsp == NULL) {
		task_unlock(current);
		task_unlock(strp->proc);
		return (-EINVAL);
	}

	fst->pwd_pid = strp->pid;

	/* XXX altroot? */
	write_lock(&fsc->lock);

	/* Stash the monitor's own directories; no extra references taken */
	fst->pwd_mnt = fsc->pwdmnt;
	fst->pwd_dentry = fsc->pwd;
	fst->root_mnt = fsc->rootmnt;
	fst->root_dentry = fsc->root;

	/* Borrow the traced process's directories, with references */
	read_lock(&fsp->lock);
	fsc->pwdmnt = mntget(fsp->pwdmnt);
	fsc->pwd = dget(fsp->pwd);
	fsc->rootmnt = mntget(fsp->rootmnt);
	fsc->root = dget(fsp->root);
	read_unlock(&fsp->lock);

	write_unlock(&fsc->lock);

	task_unlock(current);
	task_unlock(strp->proc);

	return (0);
}

/*
 * STRIOCRESCWD: undo systrace_getcwd().
 *
 * The references on the borrowed pwd/root are dropped and the
 * directories stashed in `fst' are put back into the caller's
 * fs_struct.  fst->pwd_pid is reset to 0 afterwards.
 *
 * Returns 0 on success or -EINVAL when no switch is in effect
 * (fst->pwd_pid == 0).
 */
int
systrace_rescwd(struct fsystrace *fst)
{
	struct fs_struct *fsc;

	if (fst->pwd_pid == 0)
		return (-EINVAL);

	fsc = current->fs;

	write_lock(&fsc->lock);
	/* Release the references taken in systrace_getcwd() */
	dput(fsc->pwd);
	mntput(fsc->pwdmnt);
	dput(fsc->root);
	mntput(fsc->rootmnt);

	/* Restore the caller's original directories */
	fsc->pwd = fst->pwd_dentry;
	fsc->pwdmnt = fst->pwd_mnt;
	fsc->root = fst->root_dentry;
	fsc->rootmnt = fst->root_mnt;
	write_unlock(&fsc->lock);

	fst->pwd_pid = 0;

	return (0);
}

/*
 * Check whether a traced process can take a request from its monitor.
 *
 * The process is ready when its message is no longer on the message
 * queue, it is waiting for an answer and it is not exiting.
 *
 * Returns 0 when ready, -EBUSY otherwise.
 */
int
systrace_processready(struct str_process *strp)
{
	/* A disabled (#if 0) check of strp->proc->state used to follow these */
	if (ISSET(strp->flags, STR_PROC_ONQUEUE) ||
	    !ISSET(strp->flags, STR_PROC_WAITANSWER) ||
	    ISSET(strp->proc->flags, PF_EXITING))
		return (-EBUSY);

	return (0);
}

/*
 * Start tracing task `p' through monitor `fst'.
 *
 * A zeroed str_process is allocated, linked to the task and the
 * monitor, appended to the monitor's process list and published in
 * p->systrace.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
int
systrace_insert_process(struct fsystrace *fst, struct task_struct *p)
{
	struct str_process *entry;

	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
	if (entry == NULL)
		return (-ENOMEM);

	memset(entry, 0, sizeof(*entry));

	entry->parent = fst;
	entry->proc = p;
	entry->pid = p->pid;

	init_MUTEX(&entry->lock);
	init_waitqueue_head(&entry->wqh);

	/* Insert into parent's process list */
	TAILQ_INSERT_TAIL(&fst->processes, entry, next);
	fst->nprocesses++;

	/* XXX need process flag*/
	p->systrace = entry;

	return (0);
}

/*
 * Look up the traced process with the given pid in monitor `fst'.
 *
 * Returns the matching str_process, or NULL when the pid is not in the
 * monitor's process list or systrace_find() no longer finds a live,
 * traced task for the entry.
 */
struct str_process *
systrace_findpid(struct fsystrace *fst, pid_t pid)
{
	struct str_process *strp;

	TAILQ_FOREACH(strp, &fst->processes, next) {
		if (strp->pid != pid)
			continue;

		/* Only the first entry with this pid is considered */
		return (systrace_find(strp) != NULL ? strp : NULL);
	}

	return (NULL);
}

/*
 * Attach monitor `fst' to the task with the given pid (STRIOCATTACH).
 *
 * Returns the result of systrace_insert_process() on success, -EINVAL
 * when the task does not exist or is the caller itself, -EBUSY when it
 * is already traced and -EPERM when the caller is not allowed to trace
 * it or the target is init.
 */
int
systrace_attach(struct fsystrace *fst, pid_t pid)
{
	struct task_struct *target;
	int foreign;

	target = find_task_by_pid(pid);
	if (target == NULL)
		return (-EINVAL);

	/* (1) Same process */
	if (target->pid == current->pid)
		return (-EINVAL);

	/* (2) System process */
	/* XXX */

	/* (3) Already being systraced */
	if (target->systrace != NULL)
		return (-EBUSY);

	/*
	 * (4) We do not own it, it's not set{u,g}id AND we are not
	 *     root
	 */
	foreign = !cap_isclear(target->cap_permitted) ||
	    (target->flags & PF_SUPERPRIV) ||
	    target->euid != current->euid ||
	    target->egid != current->egid;
	if (foreign && !suser())
		return (-EPERM);

	/* (5) It's init */
	if (target->pid == 1)
		return (-EPERM);

	return (systrace_insert_process(fst, target));
}

/*
 * Stop tracing a process and free its str_process.
 *
 * The task's systrace pointer is cleared (if the task can still be
 * found), the process and the monitor are woken up, the entry is
 * removed from the message and process lists, its policy is closed,
 * any pending replacement is freed and finally the entry itself is
 * freed.  `strp' must not be used afterwards.
 *
 * Returns 0, or -EINVAL when systrace_find() did not find the task;
 * the entry is torn down in both cases.
 */
int
systrace_detach(struct str_process *strp)
{
	struct fsystrace *fst = strp->parent;
	struct task_struct *tsk = systrace_find(strp);
	int rc = 0;

	if (tsk == NULL)
		rc = -EINVAL;
	else
		tsk->systrace = NULL;

	/* Release the process if it is sleeping in systrace_make_msg() */
	if (ISSET(strp->flags, STR_PROC_WAITANSWER)) {
		CLR(strp->flags, STR_PROC_WAITANSWER);
		wake_up(&strp->wqh);
	}

	wake_up(&fst->wqh);

	if (ISSET(strp->flags, STR_PROC_ONQUEUE))
		TAILQ_REMOVE(&fst->messages, strp, msg_next);

	TAILQ_REMOVE(&fst->processes, strp, next);
	fst->nprocesses--;

	if (strp->policy != NULL)
		systrace_closepolicy(fst, strp->policy);
	if (strp->replace != NULL)
		kfree(strp->replace);

	kfree(strp);

	return (rc);
}

/*
 * Fill in and send a SYSTR_MSG_RES message reporting the result of a
 * system call: its number, argument size, result and up to
 * SYSTR_MAXARGS arguments.  Both rval slots are set to 0x42.
 *
 * Returns the result of systrace_make_msg().
 */
int
systrace_msg_result(struct fsystrace *fst, struct str_process *strp,
    int error, int code, size_t argsize, register_t args[])
{
	struct str_msg_ask *m = &strp->msg.msg_data.msg_ask;
	size_t nargs = argsize / sizeof(register_t);
	size_t k;

	if (nargs > SYSTR_MAXARGS)
		nargs = SYSTR_MAXARGS;

	m->code = code;
	/* XXX argsize */
	/* += fixup_socket_argsize ... () */
	m->argsize = argsize;
	m->result = error;
	for (k = 0; k < nargs; k++)
		m->args[k] = args[k];

	m->rval[0] = 0x42;
	m->rval[1] = 0x42;

	return (systrace_make_msg(strp, SYSTR_MSG_RES));
}

/*
 * Fill in and send a SYSTR_MSG_ASK message asking the monitor about a
 * system call: its number, argument size and up to SYSTR_MAXARGS
 * arguments.
 *
 * Returns the result of systrace_make_msg().
 */
int
systrace_msg_ask(struct fsystrace *fst, struct str_process *strp, int code,
    size_t argsize, register_t args[])
{
	struct str_msg_ask *m = &strp->msg.msg_data.msg_ask;
	size_t nargs = argsize / sizeof(register_t);
	size_t k;

	if (nargs > SYSTR_MAXARGS)
		nargs = SYSTR_MAXARGS;

	m->code = code;
	/* XXX argsize */
	m->argsize = argsize;
	for (k = 0; k < nargs; k++)
		m->args[k] = args[k];

	return (systrace_make_msg(strp, SYSTR_MSG_ASK));
}

/*
 * Fill in and send a SYSTR_MSG_UGID message carrying the traced task's
 * current effective uid and gid.
 *
 * Returns the result of systrace_make_msg().
 */
int
systrace_msg_ugid(struct fsystrace *fst, struct str_process *strp)
{
	struct task_struct *tsk = strp->proc;
	struct str_msg_ugid *m = &strp->msg.msg_data.msg_ugid;

	m->uid = tsk->euid;
	m->gid = tsk->egid;

	return (systrace_make_msg(strp, SYSTR_MSG_UGID));
}

/*
 * Fill in and send a SYSTR_MSG_EXECVE message carrying `patharg'.
 *
 * Returns the result of systrace_make_msg().
 */
int
systrace_msg_execve(struct fsystrace *fst, struct str_process *strp, register_t patharg)
{
	struct str_msg_execve *m = &strp->msg.msg_data.msg_execve;

	m->patharg = patharg;

	return (systrace_make_msg(strp, SYSTR_MSG_EXECVE));
}

/*
 * Queue a SYSTR_MSG_CHILD message for the monitor.
 *
 * The message is carried by a freshly allocated, zeroed str_process
 * that is not linked to any task.  It reports the pid and policy number
 * (-1 without a policy) of `strp' together with `npid'.  The monitor is
 * woken up; the caller does not wait for an answer.
 *
 * Returns 0 on success or -1 when the allocation fails.
 */
int
systrace_msg_child(struct fsystrace *fst, struct str_process *strp, pid_t npid)
{
	struct str_process *nstrp;
	struct str_message *msg;

	/* XXX - use kmem cache!@; pool_*() like interface to it? */
	nstrp = kmalloc(sizeof(*nstrp), GFP_KERNEL);
	if (nstrp == NULL)
		return (-1);

	memset(nstrp, 0, sizeof(*nstrp));

	DPRINTF(("%s: %p: pid %d -> pid %d\n", __func__, nstrp, strp->pid, npid));

	msg = &nstrp->msg;
	msg->msg_type = SYSTR_MSG_CHILD;
	msg->msg_pid = strp->pid;
	msg->msg_policy = -1;
	if (strp->policy)
		msg->msg_policy = strp->policy->nr;
	msg->msg_data.msg_child.new_pid = npid;

	TAILQ_INSERT_TAIL(&fst->messages, nstrp, msg_next);

	wake_up(&fst->wqh);

	return (0);
}

/*
 * Send the message prepared in strp->msg to the monitor and sleep
 * until it has been answered.
 *
 * Must be called with the monitor's fst->lock held; the lock is
 * RELEASED here before sleeping and is not retaken.  The message gets
 * a new sequence number, the process is marked as waiting for an
 * answer and, unless it is already queued, appended to the monitor's
 * message queue.
 *
 * The sleep ends when the monitor clears STR_PROC_WAITANSWER, when the
 * process has been detached, or when a signal is pending.
 *
 * Returns 0 when answered or detached, -EINTR when interrupted by a
 * signal.
 *
 * Fix: the function used to return 0 even after recording -EINTR, so
 * an interrupted wait looked like a valid answer to the caller.
 */
int
systrace_make_msg(struct str_process *strp, int type)
{
	struct str_message *msg = &strp->msg;
	struct fsystrace *fst = strp->parent;
	int error = 0;

	msg->msg_seqnr = ++strp->seqnr;
	msg->msg_type = type;
	msg->msg_pid = strp->pid;

	if (strp->policy)
		msg->msg_policy = strp->policy->nr;
	else
		msg->msg_policy = -1;

	SET(strp->flags, STR_PROC_WAITANSWER);
	if (ISSET(strp->flags, STR_PROC_ONQUEUE))
		goto out;

	TAILQ_INSERT_TAIL(&fst->messages, strp, msg_next);
	SET(strp->flags, STR_PROC_ONQUEUE);
	/*
	 * XXX; need to do schedule trick here; what if we sleep on
	 * up(), then we might have awoken again, without knowing
	 */
 out:
	wake_up(&fst->wqh);
	lock_kernel();
	up(&fst->lock);

	/* Sleep until we have got a reply */
	for (;;) {
		interruptible_sleep_on(&strp->wqh);

		if (signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* If we detach, then everything is permitted */
		if ((strp = current->systrace) == NULL)
			break;

		if (!ISSET(strp->flags, STR_PROC_WAITANSWER))
			break;
	}

	unlock_kernel();

	return (error);
}

/*
 * Set the effective (and filesystem) uid of `tsk' to `euid'.
 *
 * When the uid actually changes the task is first marked non-dumpable,
 * and capabilities are adjusted through cap_emulate_setxuid() unless
 * SECURE_NO_SETUID_FIXUP is in effect.
 * NOTE(review): cap_emulate_setxuid() operates on `current'; the
 * callers in this file pass `current' as `tsk' -- confirm for others.
 *
 * Returns the previous effective uid.
 */
uid_t
systrace_seteuid(struct task_struct *tsk, uid_t euid)
{
	const uid_t prev = tsk->euid;

	if (prev != euid) {
		/* XXX */
		tsk->mm->dumpable = 0;
		wmb();

		tsk->euid = euid;
		tsk->fsuid = euid;

		if (!issecure(SECURE_NO_SETUID_FIXUP))
			cap_emulate_setxuid(tsk->uid, prev, tsk->suid);
	}

	return (prev);
}

/*
 * Set the effective (and filesystem) gid of `tsk' to `egid'.
 *
 * When the gid actually changes the task is first marked non-dumpable.
 *
 * Returns the previous effective gid.
 *
 * Fix: the saved gid was held in a uid_t; it is a gid_t now, matching
 * the return type.
 */
gid_t
systrace_setegid(struct task_struct *tsk, gid_t egid)
{
	gid_t oldegid = tsk->egid;

	if (egid == oldegid)
		return (oldegid);

	/* XXX */
	tsk->mm->dumpable = 0;
	wmb();

	tsk->egid = egid;
	tsk->fsgid = egid;

	return (oldegid);
}

/*
 * Return the task a str_process refers to, provided it is still valid.
 *
 * The task is looked up by pid and must be the very task recorded in
 * strp->proc and still have a systrace pointer; otherwise NULL is
 * returned.
 */
struct task_struct *
systrace_find(struct str_process *strp)
{
	struct task_struct *tsk = find_task_by_pid(strp->pid);

	if (tsk == NULL || tsk != strp->proc || tsk->systrace == NULL)
		return (NULL);

	return (tsk);
}


/*
 * From kernel/sys.c
 *
 * Adjust the capability sets of `current' after a uid change, given the
 * real, effective and saved uid from before the change:
 *   - if one of the old uids was 0 and none of the current ones is, the
 *     permitted and effective sets are cleared (unless keep_capabilities
 *     is set);
 *   - if the effective uid went from 0 to non-zero, the effective set is
 *     cleared;
 *   - if the effective uid went from non-zero to 0, the effective set is
 *     set to the permitted set.
 */

static inline void cap_emulate_setxuid(int old_ruid, int old_euid, int old_suid)
{
        if ((old_ruid == 0 || old_euid == 0 || old_suid == 0) &&
            (current->uid != 0 && current->euid != 0 && current->suid != 0) &&
            !current->keep_capabilities) {
                cap_clear(current->cap_permitted);
                cap_clear(current->cap_effective);
        }
        if (old_euid == 0 && current->euid != 0) {
                cap_clear(current->cap_effective);
        }
        if (old_euid != 0 && current->euid == 0) {
                current->cap_effective = current->cap_permitted;
        }
}
