runtime/stp_utrace.c - systemtap

/*
* utrace infrastructure interface for debugging user processes
*
* Copyright (C) 2006-2014 Red Hat, Inc.  All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
* of the GNU General Public License v.2.
*
* Heavily based on the original utrace code by Roland McGrath.
*/

#ifndef _STP_UTRACE_C
#define _STP_UTRACE_C

#if (!defined(STAPCONF_UTRACE_VIA_TRACEPOINTS))
#error "STAPCONF_UTRACE_VIA_TRACEPOINTS must be defined."
#endif

#include "stp_utrace.h"
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/freezer.h>
#include <linux/slab.h>
#include <trace/events/sched.h>
#include <trace/events/syscalls.h>
#include "stp_task_work.c"
#include "linux/stp_tracepoint.h"

#include "stp_helper_lock.h"

/*
* Per-thread structure private to utrace implementation.
* If a task's utrace_flags is nonzero, its struct utrace (found
* via the task_utrace_table hash) has always been allocated first.
* Once allocated, it is never freed until utrace_free() or
* utrace_cleanup().
*
* The common event reporting loops are done by the task making the
* report without ever taking any locks.  To facilitate this, the two
* lists @attached and @attaching work together for smooth asynchronous
* attaching with low overhead.  Modifying either list requires @lock.
* The @attaching list can be modified any time while holding @lock.
* New engines being attached always go on this list.
*
* The @attached list is what the task itself uses for its reporting
* loops.  When the task itself is not quiescent, it can use the
* @attached list without taking any lock.  Nobody may modify the list
* when the task is not quiescent.  When it is quiescent, that means
* that it won't run again without taking @lock itself before using
* the list.
*
* At each place where we know the task is quiescent (or it's current),
* while holding @lock, we call splice_attaching(), below.  This moves
* the @attaching list members on to the end of the @attached list.
* Since this happens at the start of any reporting pass, any new
* engines attached asynchronously go on the stable @attached list
* in time to have their callbacks seen.
*/
struct utrace {
    stp_spinlock_t lock;
    struct list_head attached, attaching;

    struct utrace_engine *reporting;

    enum utrace_resume_action resume:UTRACE_RESUME_BITS;
    unsigned int vfork_stop:1; /* need utrace_stop() before vfork wait */
    unsigned int death:1;    /* in utrace_report_death() now */
    unsigned int reap:1;    /* release_task() has run */
    unsigned int pending_attach:1; /* need splice_attaching() */
    unsigned int task_work_added:1; /* called task_work_add() on 'work' */
    unsigned int report_work_added:1; /* called task_work_add()
                       * on 'report_work' */

    unsigned long utrace_flags;

    struct hlist_node hlist;       /* task_utrace_table linkage */
    struct task_struct *task;

    struct task_work work;
    struct task_work report_work;
};

#define TASK_UTRACE_HASH_BITS 5
#define TASK_UTRACE_TABLE_SIZE (1 << TASK_UTRACE_HASH_BITS)

static struct hlist_head task_utrace_table[TASK_UTRACE_TABLE_SIZE];
//DEFINE_MUTEX(task_utrace_mutex);      /* Protects task_utrace_table */
static STP_DEFINE_SPINLOCK(task_utrace_lock); /* Protects task_utrace_table */
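/*
 * For illustration (this sketch is not called anywhere): a task is
 * mapped to its bucket, always under task_utrace_lock, with
 *
 *   head = &task_utrace_table[hash_ptr(task, TASK_UTRACE_HASH_BITS)];
 *
 * giving 1 << TASK_UTRACE_HASH_BITS = 32 buckets; see
 * __task_utrace_struct() below for the real lookup loop.
 */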

static struct kmem_cache *utrace_cachep;
static struct kmem_cache *utrace_engine_cachep;
static const struct utrace_engine_ops utrace_detached_ops; /* forward decl */

static void utrace_report_clone(void *cb_data __attribute__ ((unused)),
                struct task_struct *task,
                struct task_struct *child);
static void utrace_report_death(void *cb_data __attribute__ ((unused)),
                struct task_struct *task);
static void utrace_report_syscall_entry(void *cb_data __attribute__ ((unused)),
                    struct pt_regs *regs, long id);
static void utrace_report_syscall_exit(void *cb_data __attribute__ ((unused)),
                       struct pt_regs *regs, long ret);

static void utrace_report_exec(void *cb_data __attribute__ ((unused)),
                   struct task_struct *task,
                   pid_t old_pid __attribute__((unused)),
                   struct linux_binprm *bprm __attribute__ ((unused)));

#define __UTRACE_UNREGISTERED    0
#define __UTRACE_REGISTERED    1
static atomic_t utrace_state = ATOMIC_INIT(__UTRACE_UNREGISTERED);

// If wake_up_state() is exported, use it.
#if defined(STAPCONF_WAKE_UP_STATE_EXPORTED)
#define stp_wake_up_state wake_up_state
// Otherwise, try to use try_to_wake_up(). The wake_up_state()
// function is just a wrapper around try_to_wake_up().
#elif defined(STAPCONF_TRY_TO_WAKE_UP_EXPORTED)
static inline int stp_wake_up_state(struct task_struct *p, unsigned int state)
{
    return try_to_wake_up(p, state, 0);
}
// Otherwise, we'll have to look up wake_up_state() with kallsyms.
#else
typedef typeof(&wake_up_state) wake_up_state_fn;
#define stp_wake_up_state (* (wake_up_state_fn)kallsyms_wake_up_state)
#endif

#if !defined(STAPCONF_SIGNAL_WAKE_UP_STATE_EXPORTED)
// Sigh. On kernels without signal_wake_up_state(), there is no
// declaration to use in 'typeof(&signal_wake_up_state)'. So, we'll
// provide one here.
void signal_wake_up_state(struct task_struct *t, unsigned int state);

// First typedef from the original decl, then #define as typecasted call.
typedef typeof(&signal_wake_up_state) signal_wake_up_state_fn;
#define signal_wake_up_state (* (signal_wake_up_state_fn)kallsyms_signal_wake_up_state)
#endif

#if !defined(STAPCONF_SIGNAL_WAKE_UP_EXPORTED)
// First typedef from the original decl, then #define as typecasted call.
typedef typeof(&signal_wake_up) signal_wake_up_fn;
#define signal_wake_up (* (signal_wake_up_fn)kallsyms_signal_wake_up)
#endif
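
/*
 * For illustration: each unexported symbol in this file is handled by
 * the same three-step pattern, sketched here with a hypothetical
 * symbol 'foo' (not a real kernel function):
 *
 *   typedef typeof(&foo) foo_fn;           (fn-pointer type from decl)
 *   #define foo (* (foo_fn)kallsyms_foo)   (calls go via the address)
 *   kallsyms_foo = (void *)kallsyms_lookup_name("foo");  (at init time)
 *
 * The #define lets existing call sites compile unchanged while really
 * dispatching through the kallsyms-resolved pointer.
 */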

#if !defined(STAPCONF___LOCK_TASK_SIGHAND_EXPORTED)
// First typedef from the original decl, then #define as typecasted call.
typedef typeof(&__lock_task_sighand) __lock_task_sighand_fn;
#define __lock_task_sighand (* (__lock_task_sighand_fn)kallsyms___lock_task_sighand)

/*
* __lock_task_sighand() is called from the inline function
* 'lock_task_sighand'. Since the real inline function won't know
* anything about our '#define' above, we have to have our own version
* of the inline function.  Sigh.
*/
static inline struct sighand_struct *
stp_lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
{
    struct sighand_struct *ret;

    ret = __lock_task_sighand(tsk, flags);
    (void)__cond_lock(&tsk->sighand->siglock, ret);
    return ret;
}
#else
#define stp_lock_task_sighand lock_task_sighand
#endif


/*
* Our internal version of signal_wake_up()/signal_wake_up_state()
* that handles the functions existing and being exported.
*/
static inline void
stp_signal_wake_up(struct task_struct *t, bool resume)
{
#if defined(STAPCONF_SIGNAL_WAKE_UP_STATE_EXPORTED)
    signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
#elif defined(STAPCONF_SIGNAL_WAKE_UP_EXPORTED)
    signal_wake_up(t, resume);
#else
    if (kallsyms_signal_wake_up_state) {
    signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
    }
    else if (kallsyms_signal_wake_up) {
    signal_wake_up(t, resume);
    }
#endif
}
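
/*
 * Usage mirrors the kernel's signal_wake_up(); for example,
 * utrace_control(..., UTRACE_INTERRUPT) below does:
 *
 *   sighand = stp_lock_task_sighand(target, &irqflags);
 *   if (likely(sighand)) {
 *       stp_signal_wake_up(target, 0);
 *       unlock_task_sighand(target, &irqflags);
 *   }
 */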


static int utrace_init(void)
{
    int i;
    int rc = -1;
        static char kmem_cache1_name[50];
        static char kmem_cache2_name[50];

    if (unlikely(stp_task_work_init() != 0))
        goto error;

    /* initialize the list heads */
    for (i = 0; i < TASK_UTRACE_TABLE_SIZE; i++) {
        INIT_HLIST_HEAD(&task_utrace_table[i]);
    }

#if !defined(STAPCONF_TRY_TO_WAKE_UP_EXPORTED) \
    && !defined(STAPCONF_WAKE_UP_STATE_EXPORTED)
    kallsyms_wake_up_state = (void *)kallsyms_lookup_name("wake_up_state");
        if (kallsyms_wake_up_state == NULL) {
        _stp_error("Can't resolve wake_up_state!");
        goto error;
        }
#endif
#if !defined(STAPCONF_SIGNAL_WAKE_UP_STATE_EXPORTED)
    /* The signal_wake_up_state() function (which replaces
     * signal_wake_up() in newer kernels) isn't exported. Look up
     * that function address. */
        kallsyms_signal_wake_up_state = (void *)kallsyms_lookup_name("signal_wake_up_state");
#endif
#if !defined(STAPCONF_SIGNAL_WAKE_UP_EXPORTED)
    /* The signal_wake_up() function isn't exported. Look up that
     * function address. */
        kallsyms_signal_wake_up = (void *)kallsyms_lookup_name("signal_wake_up");
#endif
#if (!defined(STAPCONF_SIGNAL_WAKE_UP_STATE_EXPORTED) \
     && !defined(STAPCONF_SIGNAL_WAKE_UP_EXPORTED))
        if (kallsyms_signal_wake_up_state == NULL
        && kallsyms_signal_wake_up == NULL) {
        _stp_error("Can't resolve signal_wake_up_state or signal_wake_up!");
        goto error;
        }
#endif
#if !defined(STAPCONF___LOCK_TASK_SIGHAND_EXPORTED)
    /* The __lock_task_sighand() function isn't exported. Look up
     * that function address. */
        kallsyms___lock_task_sighand = (void *)kallsyms_lookup_name("__lock_task_sighand");
        if (kallsyms___lock_task_sighand == NULL) {
        _stp_error("Can't resolve __lock_task_sighand!");
        goto error;
        }
#endif

        /* PR14781: avoid kmem_cache naming collisions (detected by CONFIG_DEBUG_VM)
           by plopping a non-conflicting token - in this case the address of a
           locally relevant variable - into the names. */
        snprintf(kmem_cache1_name, sizeof(kmem_cache1_name),
                 "utrace_%lx", (unsigned long) (& utrace_cachep));
    utrace_cachep = kmem_cache_create(kmem_cache1_name,
                                          sizeof(struct utrace),
                                          0, 0, NULL);
    if (unlikely(!utrace_cachep))
        goto error;

        snprintf(kmem_cache2_name, sizeof(kmem_cache2_name),
                 "utrace_engine_%lx", (unsigned long) (& utrace_engine_cachep));
    utrace_engine_cachep = kmem_cache_create(kmem_cache2_name,
                                                 sizeof(struct utrace_engine),
                                                 0, 0, NULL);
    if (unlikely(!utrace_engine_cachep))
        goto error;

    rc = STP_TRACE_REGISTER(sched_process_fork, utrace_report_clone);
    if (unlikely(rc != 0)) {
        _stp_error("register_trace_sched_process_fork failed: %d", rc);
        goto error;
    }
    rc = STP_TRACE_REGISTER(sched_process_exit, utrace_report_death);
    if (unlikely(rc != 0)) {
        _stp_error("register_trace_sched_process_exit failed: %d", rc);
        goto error2;
    }
    rc = STP_TRACE_REGISTER(sys_enter, utrace_report_syscall_entry);
    if (unlikely(rc != 0)) {
        _stp_error("register_trace_sys_enter failed: %d", rc);
        goto error3;
    }
    rc = STP_TRACE_REGISTER(sys_exit, utrace_report_syscall_exit);
    if (unlikely(rc != 0)) {
        _stp_error("register_trace_sys_exit failed: %d", rc);
        goto error4;
    }

    rc = STP_TRACE_REGISTER(sched_process_exec, utrace_report_exec);
    if (unlikely(rc != 0)) {
        _stp_error("register_sched_process_exec failed: %d", rc);
        goto error5;
    }

    atomic_set(&utrace_state, __UTRACE_REGISTERED);
    return 0;

error5:
    STP_TRACE_UNREGISTER(sys_exit, utrace_report_syscall_exit);
error4:
    STP_TRACE_UNREGISTER(sys_enter, utrace_report_syscall_entry);
error3:
    STP_TRACE_UNREGISTER(sched_process_exit, utrace_report_death);
error2:
    STP_TRACE_UNREGISTER(sched_process_fork, utrace_report_clone);
    tracepoint_synchronize_unregister();
error:
    if (utrace_cachep) {
        kmem_cache_destroy(utrace_cachep);
        utrace_cachep = NULL;
    }
    if (utrace_engine_cachep) {
        kmem_cache_destroy(utrace_engine_cachep);
        utrace_engine_cachep = NULL;
    }
    return rc;
}

static int utrace_exit(void)
{
#ifdef STP_TF_DEBUG
    printk(KERN_ERR "%s:%d - entry\n", __FUNCTION__, __LINE__);
#endif
    utrace_shutdown();
    stp_task_work_exit();

    /* After utrace_shutdown() and stp_task_work_exit() (and the
     * code in stap_stop_task_finder()), we're *sure* there are no
     * tracepoint probes or task work items running or scheduled
     * to be run. So, now would be a great time to actually free
     * everything. */

    if (utrace_cachep) {
        kmem_cache_destroy(utrace_cachep);
        utrace_cachep = NULL;
    }
    if (utrace_engine_cachep) {
        kmem_cache_destroy(utrace_engine_cachep);
        utrace_engine_cachep = NULL;
    }

#ifdef STP_TF_DEBUG
    printk(KERN_ERR "%s:%d - exit\n", __FUNCTION__, __LINE__);
#endif
    return 0;
}

/*
* stp_task_notify_resume() is our version of
* set_notify_resume(). When called, the task_work infrastructure will
* cause utrace_resume() to get called.
*/
static void
stp_task_notify_resume(struct task_struct *target, struct utrace *utrace)
{
    if (! utrace->task_work_added) {
        int rc = stp_task_work_add(target, &utrace->work);
        if (rc == 0) {
            utrace->task_work_added = 1;
        }
        /* stp_task_work_add() returns -ESRCH if the task has
         * already passed exit_task_work(). Just ignore this
         * error. */
        else if (rc != -ESRCH) {
            printk(KERN_ERR "%s:%d - task_work_add() returned %d\n",
                   __FUNCTION__, __LINE__, rc);
        }
    }
}

static void utrace_resume(struct task_work *work);
static void utrace_report_work(struct task_work *work);

/*
* Clean up everything associated with @task.utrace.
*
* This routine must be called under the task_utrace_lock.
*/
static void utrace_cleanup(struct utrace *utrace)
{
    struct utrace_engine *engine, *next;

    lockdep_assert_held(&task_utrace_lock);

    /* Free engines associated with the struct utrace, starting
     * with the 'attached' list then doing the 'attaching' list. */
    stp_spin_lock(&utrace->lock);
    list_for_each_entry_safe(engine, next, &utrace->attached, entry) {
#ifdef STP_TF_DEBUG
        printk(KERN_ERR "%s:%d - removing engine\n",
           __FUNCTION__, __LINE__);
#endif
        list_del_init(&engine->entry);
        /* FIXME: hmm, should this be utrace_engine_put()? */
        kmem_cache_free(utrace_engine_cachep, engine);
    }
    list_for_each_entry_safe(engine, next, &utrace->attaching, entry) {
        list_del(&engine->entry);
        kmem_cache_free(utrace_engine_cachep, engine);
    }

    if (utrace->task_work_added) {
#ifdef STP_TF_DEBUG
        if (stp_task_work_cancel(utrace->task, &utrace_resume) == NULL)
            printk(KERN_ERR "%s:%d - task_work_cancel() failed? task %p, %d, %s\n",
                   __FUNCTION__, __LINE__, utrace->task,
                   utrace->task->tgid,
                   (utrace->task->comm ? utrace->task->comm
                : "UNKNOWN"));
#else
        stp_task_work_cancel(utrace->task, &utrace_resume);
#endif
        utrace->task_work_added = 0;
    }
    if (utrace->report_work_added) {
#ifdef STP_TF_DEBUG
        if (stp_task_work_cancel(utrace->task, &utrace_report_work) == NULL)
            printk(KERN_ERR "%s:%d - task_work_cancel() failed? task %p, %d, %s\n",
                   __FUNCTION__, __LINE__, utrace->task,
                   utrace->task->tgid,
                   (utrace->task->comm ? utrace->task->comm
                : "UNKNOWN"));
#else
        stp_task_work_cancel(utrace->task, &utrace_report_work);
#endif
        utrace->report_work_added = 0;
    }
    stp_spin_unlock(&utrace->lock);

    /* Free the struct utrace itself. */
    kmem_cache_free(utrace_cachep, utrace);
#ifdef STP_TF_DEBUG
    printk(KERN_ERR "%s:%d exit\n", __FUNCTION__, __LINE__);
#endif
}

static void utrace_shutdown(void)
{
    int i;
    struct utrace *utrace;
    struct hlist_head *head;
    struct hlist_node *node, *node2;

    if (atomic_read(&utrace_state) != __UTRACE_REGISTERED)
        return;
    atomic_set(&utrace_state, __UTRACE_UNREGISTERED);

#ifdef STP_TF_DEBUG
    printk(KERN_ERR "%s:%d entry\n", __FUNCTION__, __LINE__);
#endif
    /* Unregister all the tracepoint probes. */
    STP_TRACE_UNREGISTER(sched_process_exec, utrace_report_exec);
    STP_TRACE_UNREGISTER(sched_process_fork, utrace_report_clone);
    STP_TRACE_UNREGISTER(sched_process_exit, utrace_report_death);
    STP_TRACE_UNREGISTER(sys_enter, utrace_report_syscall_entry);
    STP_TRACE_UNREGISTER(sys_exit, utrace_report_syscall_exit);

    /* When tracepoint_synchronize_unregister() returns, all
     * currently executing tracepoint probes will be finished. */
    tracepoint_synchronize_unregister();

    /* (We'd like to wait here until all currently executing
     * task_work items are finished (by calling
     * stp_task_work_exit()), but that gets stuck.)
     *
     * After the code above we're *sure* there are no tracepoint
     * probes running (or scheduled to be run). There could be
     * currently running task_work items.  Go ahead and clean up
     * everything.  Currently running items should be OK, since
     * utrace_cleanup() just puts the memory back into the utrace
     * kmem caches. */
#ifdef STP_TF_DEBUG
    printk(KERN_ERR "%s:%d - freeing task-specific\n", __FUNCTION__, __LINE__);
#endif
    stp_spin_lock(&task_utrace_lock);
    for (i = 0; i < TASK_UTRACE_TABLE_SIZE; i++) {
        head = &task_utrace_table[i];
        stap_hlist_for_each_entry_safe(utrace, node, node2, head,
                           hlist) {
            hlist_del(&utrace->hlist);
            utrace_cleanup(utrace);
        }
    }
    stp_spin_unlock(&task_utrace_lock);
#ifdef STP_TF_DEBUG
    printk(KERN_ERR "%s:%d - done\n", __FUNCTION__, __LINE__);
#endif
}

/*
* This routine must be called under the task_utrace_lock.
*/
static struct utrace *__task_utrace_struct(struct task_struct *task)
{
    struct hlist_head *head;
    struct hlist_node *node;
    struct utrace *utrace;

    lockdep_assert_held(&task_utrace_lock);
    head = &task_utrace_table[hash_ptr(task, TASK_UTRACE_HASH_BITS)];
    stap_hlist_for_each_entry(utrace, node, head, hlist) {
        if (utrace->task == task)
            return utrace;
    }
    return NULL;
}

/*
* Set up @task.utrace for the first time.  We can have races
* between two utrace_attach_task() calls here.  The task_lock()
* governs installing the new pointer.  If another one got in first,
* we just punt the new one we allocated.
*
* This returns false only in case of a memory allocation failure.
*/
static bool utrace_task_alloc(struct task_struct *task)
{
    struct utrace *utrace = kmem_cache_zalloc(utrace_cachep, GFP_IOFS);
    struct utrace *u;

    if (unlikely(!utrace))
        return false;
    stp_spin_lock_init(&utrace->lock);
    INIT_LIST_HEAD(&utrace->attached);
    INIT_LIST_HEAD(&utrace->attaching);
    utrace->resume = UTRACE_RESUME;
    utrace->task = task;
    stp_init_task_work(&utrace->work, &utrace_resume);
    stp_init_task_work(&utrace->report_work, &utrace_report_work);

    stp_spin_lock(&task_utrace_lock);
    u = __task_utrace_struct(task);
    if (u == NULL) {
        hlist_add_head(&utrace->hlist,
                   &task_utrace_table[hash_ptr(task, TASK_UTRACE_HASH_BITS)]);
    }
    else {
        kmem_cache_free(utrace_cachep, utrace);
    }
    stp_spin_unlock(&task_utrace_lock);

    return true;
}

/*
* Correctly free a @utrace structure.
*
* Originally, this function was called via tracehook_free_task() from
* free_task() when @task is being deallocated. But free_task() has no
* tracepoint we can easily hook.
*/
static void utrace_free(struct utrace *utrace)
{
    if (unlikely(!utrace))
        return;

    /* Remove this utrace from the mapping list of tasks to
     * struct utrace. */
    stp_spin_lock(&task_utrace_lock);
    hlist_del(&utrace->hlist);
    stp_spin_unlock(&task_utrace_lock);

    /* Free the utrace struct. */
    stp_spin_lock(&utrace->lock);
#ifdef STP_TF_DEBUG
    if (unlikely(utrace->reporting)
        || unlikely(!list_empty(&utrace->attached))
        || unlikely(!list_empty(&utrace->attaching)))
        printk(KERN_ERR "%s:%d - reporting? %p, attached empty %d, attaching empty %d\n",
               __FUNCTION__, __LINE__, utrace->reporting,
               list_empty(&utrace->attached),
               list_empty(&utrace->attaching));
#endif

    if (utrace->task_work_added) {
        if (stp_task_work_cancel(utrace->task, &utrace_resume) == NULL)
            printk(KERN_ERR "%s:%d - task_work_cancel() failed? task %p, %d, %s\n",
                   __FUNCTION__, __LINE__, utrace->task,
                   utrace->task->tgid,
                   (utrace->task->comm ? utrace->task->comm
                : "UNKNOWN"));
        utrace->task_work_added = 0;
    }
    if (utrace->report_work_added) {
        if (stp_task_work_cancel(utrace->task, &utrace_report_work) == NULL)
            printk(KERN_ERR "%s:%d - task_work_cancel() failed? task %p, %d, %s\n",
                   __FUNCTION__, __LINE__, utrace->task,
                   utrace->task->tgid,
                   (utrace->task->comm ? utrace->task->comm
                : "UNKNOWN"));
        utrace->report_work_added = 0;
    }
    stp_spin_unlock(&utrace->lock);

    kmem_cache_free(utrace_cachep, utrace);
}

static struct utrace *task_utrace_struct(struct task_struct *task)
{
    struct utrace *utrace;

    stp_spin_lock(&task_utrace_lock);
    utrace = __task_utrace_struct(task);
    stp_spin_unlock(&task_utrace_lock);
    return utrace;
}

/*
* This is called when the task is safely quiescent, i.e. it won't
* consult utrace->attached without the lock.  Move any engines attached
* asynchronously from @utrace->attaching onto the @utrace->attached list.
*/
static void splice_attaching(struct utrace *utrace)
{
    lockdep_assert_held(&utrace->lock);
    list_splice_tail_init(&utrace->attaching, &utrace->attached);
    utrace->pending_attach = 0;
}

/*
* This is the exported function used by the utrace_engine_put() inline.
*/
static void __utrace_engine_release(struct kref *kref)
{
    struct utrace_engine *engine = container_of(kref, struct utrace_engine,
                            kref);
    BUG_ON(!list_empty(&engine->entry));
    if (engine->release)
        (*engine->release)(engine->data);
    kmem_cache_free(utrace_engine_cachep, engine);
}

static bool engine_matches(struct utrace_engine *engine, int flags,
               const struct utrace_engine_ops *ops, void *data)
{
    if ((flags & UTRACE_ATTACH_MATCH_OPS) && engine->ops != ops)
        return false;
    if ((flags & UTRACE_ATTACH_MATCH_DATA) && engine->data != data)
        return false;
    return engine->ops && engine->ops != &utrace_detached_ops;
}

static struct utrace_engine *find_matching_engine(
    struct utrace *utrace, int flags,
    const struct utrace_engine_ops *ops, void *data)
{
    struct utrace_engine *engine;
    list_for_each_entry(engine, &utrace->attached, entry)
        if (engine_matches(engine, flags, ops, data))
            return engine;
    list_for_each_entry(engine, &utrace->attaching, entry)
        if (engine_matches(engine, flags, ops, data))
            return engine;
    return NULL;
}

/*
* Enqueue @engine, or maybe don't if UTRACE_ATTACH_EXCLUSIVE.
*/
static int utrace_add_engine(struct task_struct *target,
                 struct utrace *utrace,
                 struct utrace_engine *engine,
                 int flags,
                 const struct utrace_engine_ops *ops,
                 void *data)
{
    int ret;

    stp_spin_lock(&utrace->lock);

    ret = -EEXIST;
    if ((flags & UTRACE_ATTACH_EXCLUSIVE) &&
         unlikely(find_matching_engine(utrace, flags, ops, data)))
        goto unlock;

    /*
     * In case we had no engines before, make sure that
     * utrace_flags is not zero. Since we did unlock+lock
     * at least once after utrace_task_alloc() installed
     * the struct utrace, we have the necessary barrier (in this
     * version task_utrace_struct() takes task_utrace_lock rather
     * than pairing with an rmb()).
     */
    ret = -ESRCH;
    /* FIXME: Hmm, no reap in the brave new world... */
    if (!utrace->utrace_flags) {
        utrace->utrace_flags = UTRACE_EVENT(REAP);
        /*
         * If we race with tracehook_prepare_release_task()
         * make sure that either it sees utrace_flags != 0
         * or we see exit_state == EXIT_DEAD.
         */
        smp_mb();
        if (unlikely(target->exit_state == EXIT_DEAD)) {
            utrace->utrace_flags = 0;
            goto unlock;
        }
    }

    /*
     * Put the new engine on the pending ->attaching list.
     * Make sure it gets onto the ->attached list by the next
     * time it's examined.  Setting ->pending_attach ensures
     * that start_report() takes the lock and splices the lists
     * before the next new reporting pass.
     *
     * When target == current, it would be safe just to call
     * splice_attaching() right here.  But if we're inside a
     * callback, that would mean the new engine also gets
     * notified about the event that precipitated its own
     * creation.  This is not what the user wants.
     */
    list_add_tail(&engine->entry, &utrace->attaching);
    utrace->pending_attach = 1;
    utrace_engine_get(engine);
    ret = 0;
unlock:
    stp_spin_unlock(&utrace->lock);

    return ret;
}

/**
* utrace_attach_task - attach new engine, or look up an attached engine
* @target:    thread to attach to
* @flags:    flag bits combined with OR, see below
* @ops:    callback table for new engine
* @data:    engine private data pointer
*
* The caller must ensure that the @target thread does not get freed,
* i.e. hold a ref or be its parent.  It is always safe to call this
* on @current, or on the @child pointer in a @report_clone callback.
*
* UTRACE_ATTACH_CREATE:
* Create a new engine.  If %UTRACE_ATTACH_CREATE is not specified, you
* only look up an existing engine already attached to the thread.
*
* *** FIXME: needed??? ***
* UTRACE_ATTACH_EXCLUSIVE:
* Attempting to attach a second (matching) engine fails with -%EEXIST.
*
* UTRACE_ATTACH_MATCH_OPS: Only consider engines matching @ops.
* UTRACE_ATTACH_MATCH_DATA: Only consider engines matching @data.
*
* *** FIXME: need exclusive processing??? ***
* Calls with neither %UTRACE_ATTACH_MATCH_OPS nor %UTRACE_ATTACH_MATCH_DATA
* match the first among any engines attached to @target.  That means that
* %UTRACE_ATTACH_EXCLUSIVE in such a call fails with -%EEXIST if there
* are any engines on @target at all.
*/
static struct utrace_engine *utrace_attach_task(
    struct task_struct *target, int flags,
    const struct utrace_engine_ops *ops, void *data)
{
    struct utrace *utrace = task_utrace_struct(target);
    struct utrace_engine *engine;
    int ret;

#ifdef STP_TF_DEBUG
    printk(KERN_ERR "%s:%d - target %p, utrace %p\n", __FUNCTION__, __LINE__,
           target, utrace);
#endif

    if (!(flags & UTRACE_ATTACH_CREATE)) {
        if (unlikely(!utrace))
            return ERR_PTR(-ENOENT);
        stp_spin_lock(&utrace->lock);
        engine = find_matching_engine(utrace, flags, ops, data);
        if (engine)
            utrace_engine_get(engine);
        stp_spin_unlock(&utrace->lock);
        return engine ?: ERR_PTR(-ENOENT);
    }

    if (unlikely(!ops) || unlikely(ops == &utrace_detached_ops))
        return ERR_PTR(-EINVAL);

    if (unlikely(target->flags & PF_KTHREAD))
        /*
         * Silly kernel, utrace is for users!
         */
        return ERR_PTR(-EPERM);

    if (!utrace) {
        if (unlikely(!utrace_task_alloc(target)))
            return ERR_PTR(-ENOMEM);
        utrace = task_utrace_struct(target);
    }

    engine = kmem_cache_alloc(utrace_engine_cachep, GFP_IOFS);
    if (unlikely(!engine))
        return ERR_PTR(-ENOMEM);

    /*
     * Initialize the new engine structure.  It starts out with one ref
     * to return.  utrace_add_engine() adds another for being attached.
     */
    kref_init(&engine->kref);
    engine->flags = 0;
    engine->ops = ops;
    engine->data = data;
    engine->release = ops->release;

    ret = utrace_add_engine(target, utrace, engine, flags, ops, data);

    if (unlikely(ret)) {
        kmem_cache_free(utrace_engine_cachep, engine);
        engine = ERR_PTR(ret);
    }


    return engine;
}
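
/*
 * A sketch of the two calling modes ('my_ops' and 'my_data' are
 * hypothetical, caller-supplied names):
 *
 *   (attach a new, exclusive engine)
 *   engine = utrace_attach_task(task, UTRACE_ATTACH_CREATE |
 *                               UTRACE_ATTACH_EXCLUSIVE |
 *                               UTRACE_ATTACH_MATCH_OPS,
 *                               &my_ops, my_data);
 *
 *   (later: look up the engine attached with those ops)
 *   engine = utrace_attach_task(task, UTRACE_ATTACH_MATCH_OPS,
 *                               &my_ops, NULL);
 *   if (!IS_ERR(engine))
 *       utrace_engine_put(engine);   (drop the lookup reference)
 */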

/*
* When an engine is detached, the target thread may still see it and
* make callbacks until it quiesces.  We install a special ops vector
* with these two callbacks.  When the target thread quiesces, it can
* safely free the engine itself.  For any event we will always get
* the report_quiesce() callback first, so we only need this one
* pointer to be set.  The only exception is report_reap(), so we
* supply that callback too.
*/
static u32 utrace_detached_quiesce(u32 action, struct utrace_engine *engine,
                   unsigned long event)
{
    return UTRACE_DETACH;
}

static void utrace_detached_reap(struct utrace_engine *engine,
                 struct task_struct *task)
{
}

static const struct utrace_engine_ops utrace_detached_ops = {
    .report_quiesce = &utrace_detached_quiesce,
    .report_reap = &utrace_detached_reap
};

/*
* The caller has to hold a ref on the engine.  If the attached flag is
* true (all but utrace_barrier() calls), the engine is supposed to be
* attached.  If the attached flag is false (utrace_barrier() only),
* then return -ERESTARTSYS for an engine marked for detach but not yet
* fully detached.  The task pointer can be invalid if the engine is
* detached.
*
* Get the utrace lock for the target task.
* Returns the struct if locked, or ERR_PTR(-errno).
*
* This has to be robust against races with:
*    utrace_control(target, UTRACE_DETACH) calls
*    UTRACE_DETACH after reports
*    utrace_report_death
*    utrace_release_task
*/
static struct utrace *get_utrace_lock(struct task_struct *target,
                      struct utrace_engine *engine,
                      bool attached)
    __acquires(utrace->lock)
{
    struct utrace *utrace;

    rcu_read_lock();

    /*
     * If this engine was already detached, bail out before we look at
     * the task_struct pointer at all.  If it's detached after this
     * check, then RCU is still keeping this task_struct pointer valid.
     *
     * The ops pointer is NULL when the engine is fully detached.
     * It's &utrace_detached_ops when it's marked detached but still
     * on the list.  In the latter case, utrace_barrier() still works,
     * since the target might be in the middle of an old callback.
     */
    if (unlikely(!engine->ops)) {
        rcu_read_unlock();
        return ERR_PTR(-ESRCH);
    }

    if (unlikely(engine->ops == &utrace_detached_ops)) {
        rcu_read_unlock();
        return attached ? ERR_PTR(-ESRCH) : ERR_PTR(-ERESTARTSYS);
    }

    utrace = task_utrace_struct(target);
    stp_spin_lock(&utrace->lock);
    if (unlikely(utrace->reap) || unlikely(!engine->ops) ||
        unlikely(engine->ops == &utrace_detached_ops)) {
        /*
         * By the time we got the utrace lock,
         * it had been reaped or detached already.
         */
        stp_spin_unlock(&utrace->lock);
        utrace = ERR_PTR(-ESRCH);
        if (!attached && engine->ops == &utrace_detached_ops)
            utrace = ERR_PTR(-ERESTARTSYS);
    }
    rcu_read_unlock();

    return utrace;
}

/*
* Now that we don't hold any locks, run through any
* detached engines and free their references.  Each
* engine had one implicit ref while it was attached.
*/
static void put_detached_list(struct list_head *list)
{
    struct utrace_engine *engine, *next;
    list_for_each_entry_safe(engine, next, list, entry) {
        list_del_init(&engine->entry);
        utrace_engine_put(engine);
    }
}

/*
* We use an extra bit in utrace_engine.flags past the event bits,
* to record whether the engine is keeping the target thread stopped.
*
* This bit is set in task_struct.utrace_flags whenever it is set in any
* engine's flags.  Only utrace_reset() resets it in utrace_flags.
*/
#define ENGINE_STOP        (1UL << _UTRACE_NEVENTS)
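
/*
 * For example (illustrative numbers only): if _UTRACE_NEVENTS were 20,
 * ENGINE_STOP would be bit 20 (0x100000), one position above the
 * highest UTRACE_EVENT() bit; that is why utrace_set_events() can
 * strip it with 'events &= ~ENGINE_STOP'.
 */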

static void mark_engine_wants_stop(struct utrace *utrace,
                   struct utrace_engine *engine)
{
    engine->flags |= ENGINE_STOP;
    utrace->utrace_flags |= ENGINE_STOP;
}

static void clear_engine_wants_stop(struct utrace_engine *engine)
{
    engine->flags &= ~ENGINE_STOP;
}

static bool engine_wants_stop(struct utrace_engine *engine)
{
    return (engine->flags & ENGINE_STOP) != 0;
}

/**
* utrace_set_events - choose which event reports a tracing engine gets
* @target:        thread to affect
* @engine:        attached engine to affect
* @events:        new event mask
*
* This changes the set of events for which @engine wants callbacks made.
*
* This fails with -%EALREADY and does nothing if you try to clear
* %UTRACE_EVENT(%DEATH) when the @report_death callback may already have
* begun, or if you try to newly set %UTRACE_EVENT(%DEATH) or
* %UTRACE_EVENT(%QUIESCE) when @target is already dead or dying.
*
* This fails with -%ESRCH if you try to clear %UTRACE_EVENT(%REAP) when
* the @report_reap callback may already have begun, or when @target has
* already been detached, including forcible detach on reaping.
*
* If @target was stopped before the call, then after a successful call,
* no event callbacks not requested in @events will be made; if
* %UTRACE_EVENT(%QUIESCE) is included in @events, then a
* @report_quiesce callback will be made when @target resumes.
*
* If @target was not stopped and @events excludes some bits that were
* set before, this can return -%EINPROGRESS to indicate that @target
* may have been making some callback to @engine.  When this returns
* zero, you can be sure that no event callbacks you've disabled in
* @events can be made.  If @events only sets new bits that were not set
* before on @engine, then -%EINPROGRESS will never be returned.
*
* To synchronize after an -%EINPROGRESS return, see utrace_barrier().
*
* When @target is @current, -%EINPROGRESS is not returned.  But note
* that a newly-created engine will not receive any callbacks related to
* an event notification already in progress.  This call enables @events
* callbacks to be made as soon as @engine becomes eligible for any
* callbacks, see utrace_attach_task().
*
* These rules provide for coherent synchronization based on %UTRACE_STOP,
* even when %SIGKILL is breaking its normal simple rules.
*/
static int utrace_set_events(struct task_struct *target,
                 struct utrace_engine *engine,
                 unsigned long events)
{
    struct utrace *utrace;
    unsigned long old_flags, old_utrace_flags;
    int ret = -EALREADY;

    /*
     * We just ignore the internal bit, so callers can use
     * engine->flags to seed bitwise ops for our argument.
     */
    events &= ~ENGINE_STOP;

    utrace = get_utrace_lock(target, engine, true);
    if (unlikely(IS_ERR(utrace)))
        return PTR_ERR(utrace);

    old_utrace_flags = utrace->utrace_flags;
    old_flags = engine->flags & ~ENGINE_STOP;

    /*
     * If utrace_report_death() is already in progress now,
     * it's too late to clear the death event bits.
     */
    if (target->exit_state &&
        (((events & ~old_flags) & _UTRACE_DEATH_EVENTS) ||
         (utrace->death &&
          ((old_flags & ~events) & _UTRACE_DEATH_EVENTS)) ||
         (utrace->reap && ((old_flags & ~events) & UTRACE_EVENT(REAP)))))
        goto unlock;

    /*
     * When setting these flags, it's essential that we really
     * synchronize with exit_notify().  They cannot be set after
     * exit_notify() takes the tasklist_lock.  By holding the read
     * lock here while setting the flags, we ensure that the calls
     * to tracehook_notify_death() and tracehook_report_death() will
     * see the new flags.  This ensures that utrace_release_task()
     * knows positively that utrace_report_death() will be called or
     * that it won't.
     */
    if ((events & ~old_flags) & _UTRACE_DEATH_EVENTS) {
        /* FIXME: we can't get the tasklist_lock (since it
         * isn't exported).  Plus, there is no more tracehook
         * in exit_notify().  So, we'll ignore this for now
         * and just assume that the lock on utrace is
         * enough.  */
        //read_lock(&tasklist_lock);
        if (unlikely(target->exit_state)) {
            //read_unlock(&tasklist_lock);
            goto unlock;
        }
        utrace->utrace_flags |= events;
        //read_unlock(&tasklist_lock);
    }

    engine->flags = events | (engine->flags & ENGINE_STOP);
    utrace->utrace_flags |= events;

    ret = 0;
    if ((old_flags & ~events) && target != current &&
        !task_is_stopped_or_traced(target) && !target->exit_state) {
        /*
         * This barrier ensures that our engine->flags changes
         * have hit before we examine utrace->reporting,
         * pairing with the barrier in start_callback().  If
         * @target has not yet hit finish_callback() to clear
         * utrace->reporting, we might be in the middle of a
         * callback to @engine.
         */
        smp_mb();
        if (utrace->reporting == engine)
            ret = -EINPROGRESS;
    }
unlock:
    stp_spin_unlock(&utrace->lock);

    return ret;
}

/*
* Asynchronously mark an engine as being detached.
*
* This must work while the target thread races with us doing
* start_callback(), defined below.  It uses smp_rmb() between checking
* @engine->flags and using @engine->ops.  Here we change @engine->ops
* first, then use smp_wmb() before changing @engine->flags.  This ensures
* it can check the old flags before using the old ops, or check the old
* flags before using the new ops, or check the new flags before using the
* new ops, but can never check the new flags before using the old ops.
* Hence, utrace_detached_ops might be used with any old flags in place.
* It has report_quiesce() and report_reap() callbacks to handle all cases.
*/
static void mark_engine_detached(struct utrace_engine *engine)
{
    engine->ops = &utrace_detached_ops;
    smp_wmb();
    engine->flags = UTRACE_EVENT(QUIESCE);
}
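
/*
 * The reader side of this ordering lives in start_callback() (later
 * in this file), which conceptually does:
 *
 *   events = engine->flags;
 *   smp_rmb();
 *   ops = engine->ops;
 *
 * so a reader can never observe the new flags together with the old
 * ops.
 */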

/*
* Get @target to stop and return true if it is already stopped now.
* If we return false, it will make some event callback soonish.
* Called with @utrace locked.
*/
static bool utrace_do_stop(struct task_struct *target, struct utrace *utrace)
{
    if (task_is_stopped(target)) {
        /*
         * Stopped is considered quiescent; when it wakes up, it will
         * go through utrace_finish_stop() before doing anything else.
         */
        spin_lock_irq(&target->sighand->siglock);
        if (likely(task_is_stopped(target)))
            __set_task_state(target, TASK_TRACED);
        spin_unlock_irq(&target->sighand->siglock);
    } else if (utrace->resume > UTRACE_REPORT) {
        utrace->resume = UTRACE_REPORT;
        stp_task_notify_resume(target, utrace);
    }

    return task_is_traced(target);
}

/*
* If the target is not dead it should not be in tracing
* stop any more.  Wake it unless it's in job control stop.
*/
static void utrace_wakeup(struct task_struct *target, struct utrace *utrace)
{
    lockdep_assert_held(&utrace->lock);
    spin_lock_irq(&target->sighand->siglock);
    if (target->signal->flags & SIGNAL_STOP_STOPPED ||
        target->signal->group_stop_count)
        target->state = TASK_STOPPED;
    else
        stp_wake_up_state(target, __TASK_TRACED);
    spin_unlock_irq(&target->sighand->siglock);
}

/*
* This is called when there might be some detached engines on the list or
* some stale bits in @task->utrace_flags.  Clean them up and recompute the
* flags.  Returns true if we're now fully detached.
*
* Called with @utrace->lock held, returns with it released.
* After this returns, @utrace might be freed if everything detached.
*/
static bool utrace_reset(struct task_struct *task, struct utrace *utrace)
    __releases(utrace->lock)
{
    struct utrace_engine *engine, *next;
    unsigned long flags = 0;
    LIST_HEAD(detached);

    splice_attaching(utrace);

    /*
     * Update the set of events of interest from the union
     * of the interests of the remaining tracing engines.
     * For any engine marked detached, remove it from the list.
     * We'll collect them on the detached list.
     */
    list_for_each_entry_safe(engine, next, &utrace->attached, entry) {
        if (engine->ops == &utrace_detached_ops) {
            engine->ops = NULL;
            list_move(&engine->entry, &detached);
        } else {
            flags |= engine->flags | UTRACE_EVENT(REAP);
        }
    }

    if (task->exit_state) {
        /*
         * Once it's already dead, we never install any flags
         * except REAP.  When ->exit_state is set and events
         * like DEATH are not set, then they never can be set.
         * This ensures that utrace_release_task() knows
         * positively that utrace_report_death() can never run.
         */
        BUG_ON(utrace->death);
        flags &= UTRACE_EVENT(REAP);
    }

    if (!flags) {
        /*
         * No more engines, cleared out the utrace.
         */
        utrace->resume = UTRACE_RESUME;
    }

    /*
     * If no more engines want it stopped, wake it up.
     */
    if (task_is_traced(task) && !(flags & ENGINE_STOP))
        utrace_wakeup(task, utrace);

    /*
     * In theory spin_lock() doesn't imply rcu_read_lock().
     * Once we clear ->utrace_flags this task_struct can go away
     * because tracehook_prepare_release_task() path does not take
     * utrace->lock when ->utrace_flags == 0.
     */
    rcu_read_lock();
    utrace->utrace_flags = flags;
    stp_spin_unlock(&utrace->lock);
    rcu_read_unlock();

    put_detached_list(&detached);

    return !flags;
}

static void utrace_finish_stop(void)
{
    /*
     * If we were task_is_traced() and then SIGKILL'ed, make
     * sure we do nothing until the tracer drops utrace->lock.
     */
    if (unlikely(__fatal_signal_pending(current))) {
        struct utrace *utrace = task_utrace_struct(current);
        stp_spin_unlock_wait(&utrace->lock);
    }
}

/*
* Perform %UTRACE_STOP, i.e. block in TASK_TRACED until woken up.
* @task == current, @utrace == current->utrace, which is not locked.
* If we are woken up by SIGKILL we just return, even though some
* utrace engine may still want us to stay stopped.
*/
static void utrace_stop(struct task_struct *task, struct utrace *utrace,
            enum utrace_resume_action action)
{
relock:
    stp_spin_lock(&utrace->lock);

    if (action < utrace->resume) {
        /*
         * Ensure a reporting pass when we're resumed.
         */
        utrace->resume = action;
        stp_task_notify_resume(task, utrace);
        if (action == UTRACE_INTERRUPT)
            set_thread_flag(TIF_SIGPENDING);
    }

    /*
     * If the ENGINE_STOP bit is clear in utrace_flags, that means
     * utrace_reset() ran after we processed some UTRACE_STOP return
     * values from callbacks to get here.  If all engines have detached
     * or resumed us, we don't stop.  This check doesn't require
     * siglock, but it should follow the interrupt/report bookkeeping
     * steps (this can matter for UTRACE_RESUME but not UTRACE_DETACH).
     */
    if (unlikely(!(utrace->utrace_flags & ENGINE_STOP))) {
        utrace_reset(task, utrace);
        if (utrace->utrace_flags & ENGINE_STOP)
            goto relock;
        return;
    }

    /*
     * The siglock protects us against signals.  As well as SIGKILL
     * waking us up, we must synchronize with the signal bookkeeping
     * for stop signals and SIGCONT.
     */
    spin_lock_irq(&task->sighand->siglock);

    if (unlikely(__fatal_signal_pending(task))) {
        spin_unlock_irq(&task->sighand->siglock);
        stp_spin_unlock(&utrace->lock);
        return;
    }

    __set_current_state(TASK_TRACED);

    /*
     * If there is a group stop in progress,
     * we must participate in the bookkeeping.
     */
    if (unlikely(task->signal->group_stop_count) &&
            !--task->signal->group_stop_count)
        task->signal->flags = SIGNAL_STOP_STOPPED;

    spin_unlock_irq(&task->sighand->siglock);
    stp_spin_unlock(&utrace->lock);

    schedule();

    utrace_finish_stop();

    /*
     * While in TASK_TRACED, we were considered "frozen enough".
     * Now that we woke up, it's crucial if we're supposed to be
     * frozen that we freeze now before running anything substantial.
     */
    try_to_freeze();

    /*
     * While we were in TASK_TRACED, complete_signal() considered
     * us "uninterested" in signal wakeups.  Now make sure our
     * TIF_SIGPENDING state is correct for normal running.
     */
    spin_lock_irq(&task->sighand->siglock);
    recalc_sigpending();
    spin_unlock_irq(&task->sighand->siglock);
}

/*
* Called by release_task() with @reap set to true.
* Called by utrace_report_death() with @reap set to false.
* On reap, make report_reap callbacks and clean out @utrace
* unless still making callbacks.  On death, update bookkeeping
* and handle the reap work if release_task() came in first.
*/
static void utrace_maybe_reap(struct task_struct *target, struct utrace *utrace,
                  bool reap)
{
    struct utrace_engine *engine, *next;
    struct list_head attached;

    stp_spin_lock(&utrace->lock);

    if (reap) {
        /*
         * If the target will do some final callbacks but hasn't
         * finished them yet, we know because it clears these event
         * bits after it's done.  Instead of cleaning up here and
         * requiring utrace_report_death() to cope with it, we
         * delay the REAP report and the teardown until after the
         * target finishes its death reports.
         */
        utrace->reap = 1;

        if (utrace->utrace_flags & _UTRACE_DEATH_EVENTS) {
            stp_spin_unlock(&utrace->lock);
            return;
        }
    } else {
        /*
         * After we unlock with this flag clear, any competing
         * utrace_control/utrace_set_events calls know that we've
         * finished our callbacks and any detach bookkeeping.
         */
        utrace->death = 0;

        if (!utrace->reap) {
            /*
             * We're just dead, not reaped yet.  This will
             * reset @target->utrace_flags so the later call
             * with @reap set won't hit the check above.
             */
            utrace_reset(target, utrace);
            return;
        }
    }

    /*
     * utrace_add_engine() checks ->utrace_flags != 0.  Since
     * @utrace->reap is set, nobody can set or clear UTRACE_EVENT(REAP)
     * in @engine->flags or change @engine->ops and nobody can change
     * @utrace->attached after we drop the lock.
     */
    utrace->utrace_flags = 0;

    /*
     * We clear out @utrace->attached before we drop the lock so
     * that find_matching_engine() can't come across any old engine
     * while we are busy tearing it down.
     */
    list_replace_init(&utrace->attached, &attached);
    list_splice_tail_init(&utrace->attaching, &attached);

    stp_spin_unlock(&utrace->lock);

    list_for_each_entry_safe(engine, next, &attached, entry) {
        if (engine->flags & UTRACE_EVENT(REAP))
            engine->ops->report_reap(engine, target);

        engine->ops = NULL;
        engine->flags = 0;
        list_del_init(&engine->entry);

        utrace_engine_put(engine);
    }
}

/*
* You can't do anything to a dead task but detach it.
* If release_task() has been called, you can't do that.
*
* On the exit path, DEATH and QUIESCE event bits are set only
* before utrace_report_death() has taken the lock.  At that point,
* the death report will come soon, so disallow detach until it's
* done.  This prevents us from racing with it detaching itself.
*
* Called only when @target->exit_state is nonzero.
*/
static inline int utrace_control_dead(struct task_struct *target,
                      struct utrace *utrace,
                      enum utrace_resume_action action)
{
    lockdep_assert_held(&utrace->lock);

    if (action != UTRACE_DETACH || unlikely(utrace->reap))
        return -ESRCH;

    if (unlikely(utrace->death))
        /*
         * We have already started the death report.  We can't
         * prevent the report_death and report_reap callbacks,
         * so tell the caller they will happen.
         */
        return -EALREADY;

    return 0;
}

/**
* utrace_control - control a thread being traced by a tracing engine
* @target:        thread to affect
* @engine:        attached engine to affect
* @action:        &enum utrace_resume_action for thread to do
*
* This is how a tracing engine asks a traced thread to do something.
* This call is controlled by the @action argument, which has the
* same meaning as the &enum utrace_resume_action value returned by
* event reporting callbacks.
*
* If @target is already dead (@target->exit_state nonzero),
* all actions except %UTRACE_DETACH fail with -%ESRCH.
*
* The following sections describe each option for the @action argument.
*
* UTRACE_DETACH:
*
* After this, the @engine data structure is no longer accessible,
* and the thread might be reaped.  The thread will start running
* again if it was stopped and no longer has any attached engines
* that want it stopped.
*
* If the @report_reap callback may already have begun, this fails
* with -%ESRCH.  If the @report_death callback may already have
* begun, this fails with -%EALREADY.
*
* If @target is not already stopped, then a callback to this engine
* might be in progress or about to start on another CPU.  If so,
* then this returns -%EINPROGRESS; the detach happens as soon as
* the pending callback is finished.  To synchronize after an
* -%EINPROGRESS return, see utrace_barrier().
*
* If @target is properly stopped before utrace_control() is called,
* then after successful return it's guaranteed that no more callbacks
* to the @engine->ops vector will be made.
*
* The only exception is %SIGKILL (and exec or group-exit by another
* thread in the group), which can cause asynchronous @report_death
* and/or @report_reap callbacks even when %UTRACE_STOP was used.
* (In that event, this fails with -%ESRCH or -%EALREADY, see above.)
*
* UTRACE_STOP:
*
* This asks that @target stop running.  This returns 0 only if
* @target is already stopped, either for tracing or for job
* control.  Then @target will remain stopped until another
* utrace_control() call is made on @engine; @target can be woken
* only by %SIGKILL (or equivalent, such as exec or termination by
* another thread in the same thread group).
*
* This returns -%EINPROGRESS if @target is not already stopped.
* Then the effect is like %UTRACE_REPORT.  A @report_quiesce
* callback will be made soon.  Your callback can
* then return %UTRACE_STOP to keep @target stopped.
*
* This does not interrupt system calls in progress, including ones
* that sleep for a long time.
*
* UTRACE_RESUME:
*
* Just let @target continue running normally, reversing the effect
* of a previous %UTRACE_STOP.  If another engine is keeping @target
* stopped, then it remains stopped until all engines let it resume.
* If @target was not stopped, this has no effect.
*
* UTRACE_REPORT:
*
* This is like %UTRACE_RESUME, but also ensures that there will be
* a @report_quiesce callback made soon.  If
* @target had been stopped, then there will be a callback before it
* resumes running normally.  If another engine is keeping @target
* stopped, then there might be no callbacks until all engines let
* it resume.
*
* Since this is meaningless unless @report_quiesce callbacks will
* be made, it returns -%EINVAL if @engine lacks %UTRACE_EVENT(%QUIESCE).
*
* UTRACE_INTERRUPT:
*
* This is like %UTRACE_REPORT, but ensures that @target will make a
* callback before it resumes or delivers signals.  If @target was in
* a system call or about to enter one, work in progress will be
* interrupted as if by %SIGSTOP.  If another engine is keeping
* @target stopped, then there might be no callbacks until all engines
* let it resume.
*/
static int utrace_control(struct task_struct *target,
              struct utrace_engine *engine,
              enum utrace_resume_action action)
{
    struct utrace *utrace;
    bool reset;
    int ret;

    if (unlikely(action >= UTRACE_RESUME_MAX)) {
        WARN(1, "invalid action argument to utrace_control()!");
        return -EINVAL;
    }

    /*
     * This is a sanity check for a programming error in the caller.
     * Their request can only work properly in all cases by relying on
     * a follow-up callback, but they didn't set one up!  This check
     * doesn't do locking, but it shouldn't matter.  The caller has to
     * be synchronously sure the callback is set up to be operating the
     * interface properly.
     */
    if (action >= UTRACE_REPORT && action < UTRACE_RESUME &&
        unlikely(!(engine->flags & UTRACE_EVENT(QUIESCE)))) {
        WARN(1, "utrace_control() with no QUIESCE callback in place!");
        return -EINVAL;
    }

    utrace = get_utrace_lock(target, engine, true);
    if (unlikely(IS_ERR(utrace)))
        return PTR_ERR(utrace);

    reset = task_is_traced(target);
    ret = 0;

    /*
     * ->exit_state can change under us, this doesn't matter.
     * We do not care about ->exit_state in fact, but we do
     * care about ->reap and ->death. If either flag is set,
     * we must also see ->exit_state != 0.
     */
    if (unlikely(target->exit_state)) {
        ret = utrace_control_dead(target, utrace, action);
        if (ret) {
            stp_spin_unlock(&utrace->lock);
            return ret;
        }
        reset = true;
    }

    switch (action) {
    case UTRACE_STOP:
        mark_engine_wants_stop(utrace, engine);
        if (!reset && !utrace_do_stop(target, utrace))
            ret = -EINPROGRESS;
        reset = false;
        break;

    case UTRACE_DETACH:
        if (engine_wants_stop(engine))
            utrace->utrace_flags &= ~ENGINE_STOP;
        mark_engine_detached(engine);
        reset = reset || utrace_do_stop(target, utrace);
        if (!reset) {
            /*
             * As in utrace_set_events(), this barrier ensures
             * that our engine->flags changes have hit before we
             * examine utrace->reporting, pairing with the barrier
             * in start_callback().  If @target has not yet hit
             * finish_callback() to clear utrace->reporting, we
             * might be in the middle of a callback to @engine.
             */
            smp_mb();
            if (utrace->reporting == engine)
                ret = -EINPROGRESS;
        }
        break;

    case UTRACE_RESUME:
        clear_engine_wants_stop(engine);
        break;

    case UTRACE_REPORT:
        /*
         * Make the thread call tracehook_notify_resume() soon.
         * But don't bother if it's already been interrupted.
         * In that case, utrace_get_signal() will be reporting soon.
         */
        clear_engine_wants_stop(engine);
        if (action < utrace->resume) {
            utrace->resume = action;
            stp_task_notify_resume(target, utrace);
        }
        break;

    case UTRACE_INTERRUPT:
        /*
         * Make the thread call tracehook_get_signal() soon.
         */
        clear_engine_wants_stop(engine);
        if (utrace->resume == UTRACE_INTERRUPT)
            break;
        utrace->resume = UTRACE_INTERRUPT;

        /*
         * If it's not already stopped, interrupt it now.  We need
         * the siglock here in case it calls recalc_sigpending()
         * and clears its own TIF_SIGPENDING.  By taking the lock,
         * we've serialized any later recalc_sigpending() after our
         * setting of utrace->resume to force it on.
         */
        stp_task_notify_resume(target, utrace);
        if (reset) {
            /*
             * This is really just to keep the invariant that
             * TIF_SIGPENDING is set with UTRACE_INTERRUPT.
             * When it's stopped, we know it's always going
             * through utrace_get_signal() and will recalculate.
             */
            set_tsk_thread_flag(target, TIF_SIGPENDING);
        } else {
            struct sighand_struct *sighand;
            unsigned long irqflags;
            sighand = stp_lock_task_sighand(target, &irqflags);
            if (likely(sighand)) {
                stp_signal_wake_up(target, 0);
                unlock_task_sighand(target, &irqflags);
            }
        }
        break;

    default:
        BUG();        /* We checked it on entry.  */
    }

    /*
     * Let the thread resume running.  If it's not stopped now,
     * there is nothing more we need to do.
     */
    if (reset)
        utrace_reset(target, utrace);
    else
        stp_spin_unlock(&utrace->lock);

    return ret;
}
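
/*
 * Typical usage (sketch): ask a running thread to stop, then keep it
 * stopped from the quiesce callback:
 *
 *   err = utrace_control(task, engine, UTRACE_STOP);
 *   if (err == -EINPROGRESS)
 *       (a report_quiesce callback is coming; return UTRACE_STOP
 *        from it to keep the task stopped)
 */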

/**
* utrace_barrier - synchronize with simultaneous tracing callbacks
* @target:        thread to affect
* @engine:        engine to affect (can be detached)
*
* This blocks while @target might be in the midst of making a callback to
* @engine.  It can be interrupted by signals and will return -%ERESTARTSYS.
* A return value of zero means no callback from @target to @engine was
* in progress.  Any effect of its return value (such as %UTRACE_STOP) has
* already been applied to @engine.
*
* It's not necessary to keep the @target pointer alive for this call.
* It's only necessary to hold a ref on @engine.  This will return
* safely even if @target has been reaped and has no task refs.
*
* A successful return from utrace_barrier() guarantees its ordering
* with respect to utrace_set_events() and utrace_control() calls.  If
* @target was not properly stopped, event callbacks just disabled might
* still be in progress; utrace_barrier() waits until there is no chance
* an unwanted callback can be in progress.
*/
static int utrace_barrier(struct task_struct *target,
              struct utrace_engine *engine)
{
    struct utrace *utrace;
    int ret = -ERESTARTSYS;

    if (unlikely(target == current))
        return 0;

    /* If we get here, we might call
     * schedule_timeout_interruptible(), which sleeps. */
    might_sleep();
    do {
        utrace = get_utrace_lock(target, engine, false);
        if (unlikely(IS_ERR(utrace))) {
            ret = PTR_ERR(utrace);
            if (ret != -ERESTARTSYS)
                break;
        } else {
            /*
             * All engine state changes are done while
             * holding the lock, i.e. before we get here.
             * Since we have the lock, we only need to
             * worry about @target making a callback.
             * When it has entered start_callback() but
             * not yet gotten to finish_callback(), we
             * will see utrace->reporting == @engine.
             * When @target doesn't take the lock, it uses
             * barriers to order setting utrace->reporting
             * before it examines the engine state.
             */
            if (utrace->reporting != engine)
                ret = 0;
            stp_spin_unlock(&utrace->lock);
            if (!ret)
                break;
        }
        schedule_timeout_interruptible(1);
    } while (!signal_pending(current));

    return ret;
}
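
/*
* For reference, a hedged caller-side sketch of the pattern the two
* return values above imply (the function and variable names here are
* hypothetical, not part of this file): detach with utrace_control(),
* then use utrace_barrier() to wait out any callback still running.
*/
#if 0
static int example_detach_synchronously(struct task_struct *task,
                    struct utrace_engine *engine)
{
    int ret = utrace_control(task, engine, UTRACE_DETACH);

    if (ret == -EINPROGRESS)
        /* The detach is marked, but a callback to @engine may
         * still be in progress; wait until none can be.  A
         * -ERESTARTSYS return means a signal interrupted the
         * wait and the caller should retry or bail out. */
        ret = utrace_barrier(task, engine);
    return ret;
}
#endif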

/*
* This is local state used for reporting loops, perhaps optimized away.
*/
struct utrace_report {
    u32 result;
    enum utrace_resume_action action;
    enum utrace_resume_action resume_action;
    bool detaches;
    bool spurious;
};

#define INIT_REPORT(var)            \
    struct utrace_report var = {        \
        .action = UTRACE_RESUME,    \
        .resume_action = UTRACE_RESUME,    \
        .spurious = true         \
    }

/*
* We are now making the report, so clear the flag saying we need one.
* When there is a new attach, ->pending_attach is set just so we will
* know to do splice_attaching() here before the callback loop.
*/
static enum utrace_resume_action start_report(struct utrace *utrace)
{
    enum utrace_resume_action resume = utrace->resume;
    if (utrace->pending_attach ||
        (resume > UTRACE_STOP && resume < UTRACE_RESUME)) {
        stp_spin_lock(&utrace->lock);
        splice_attaching(utrace);
        resume = utrace->resume;
        if (resume > UTRACE_STOP)
            utrace->resume = UTRACE_RESUME;
        stp_spin_unlock(&utrace->lock);
    }
    return resume;
}

static inline void finish_report_reset(struct task_struct *task,
                       struct utrace *utrace,
                       struct utrace_report *report)
{
    if (unlikely(report->spurious || report->detaches)) {
        stp_spin_lock(&utrace->lock);
        if (utrace_reset(task, utrace))
            report->action = UTRACE_RESUME;
    }
}

/*
* Complete a normal reporting pass, pairing with a start_report()
* call.  This handles any UTRACE_DETACH or UTRACE_REPORT returns from
* engine callbacks.  If @will_not_stop is true and any engine's last
* callback used UTRACE_STOP, we do UTRACE_REPORT here to ensure we
* stop before user mode.  If there were no callbacks made, it will
* recompute @task->utrace_flags to avoid another false-positive.
*/
static void finish_report(struct task_struct *task, struct utrace *utrace,
              struct utrace_report *report, bool will_not_stop)
{
    enum utrace_resume_action resume = report->action;

    if (resume == UTRACE_STOP)
        resume = will_not_stop ? UTRACE_REPORT : UTRACE_RESUME;

    if (resume < utrace->resume) {
        stp_spin_lock(&utrace->lock);
        utrace->resume = resume;
        stp_task_notify_resume(task, utrace);
        if (resume == UTRACE_INTERRUPT)
            set_tsk_thread_flag(task, TIF_SIGPENDING);
        stp_spin_unlock(&utrace->lock);
    }

    finish_report_reset(task, utrace, report);
}

static void finish_callback_report(struct task_struct *task,
                   struct utrace *utrace,
                   struct utrace_report *report,
                   struct utrace_engine *engine,
                   enum utrace_resume_action action)
{
    if (action == UTRACE_DETACH) {
        /*
         * By holding the lock here, we make sure that
         * utrace_barrier() (really get_utrace_lock()) sees the
         * effect of this detach.  Otherwise utrace_barrier() could
         * return 0 after this callback had returned UTRACE_DETACH.
         * This way, a 0 return is an unambiguous indicator that any
         * callback returning UTRACE_DETACH has indeed caused detach.
         */
        stp_spin_lock(&utrace->lock);
        engine->ops = &utrace_detached_ops;
        stp_spin_unlock(&utrace->lock);
    }

    /*
     * If utrace_control() was used, treat that like UTRACE_DETACH here.
     */
    if (engine->ops == &utrace_detached_ops) {
        report->detaches = true;
        return;
    }

    if (action < report->action)
        report->action = action;

    if (action != UTRACE_STOP) {
        if (action < report->resume_action)
            report->resume_action = action;

        if (engine_wants_stop(engine)) {
            stp_spin_lock(&utrace->lock);
            clear_engine_wants_stop(engine);
            stp_spin_unlock(&utrace->lock);
        }

        return;
    }

    if (!engine_wants_stop(engine)) {
        stp_spin_lock(&utrace->lock);
        /*
         * If utrace_control() came in and detached us
         * before we got the lock, we must not stop now.
         */
        if (unlikely(engine->ops == &utrace_detached_ops))
            report->detaches = true;
        else
            mark_engine_wants_stop(utrace, engine);
        stp_spin_unlock(&utrace->lock);
    }
}

/*
* Apply the return value of one engine callback to @report.
* Returns true if @engine detached and should not get any more callbacks.
*/
static bool finish_callback(struct task_struct *task, struct utrace *utrace,
                struct utrace_report *report,
                struct utrace_engine *engine,
                u32 ret)
{
    report->result = ret & ~UTRACE_RESUME_MASK;
    finish_callback_report(task, utrace, report, engine,
                   utrace_resume_action(ret));

    /*
     * Now that we have applied the effect of the return value,
     * clear this so that utrace_barrier() can stop waiting.
     * A subsequent utrace_control() can stop or resume @engine
     * and know this was ordered after its callback's action.
     *
     * We don't need any barriers here because utrace_barrier()
     * takes utrace->lock.  If we touched engine->flags above,
     * the lock guaranteed this change was before utrace_barrier()
     * examined utrace->reporting.
     */
    utrace->reporting = NULL;

    /*
     * We've just done an engine callback.  These are *not*
     * allowed to sleep, unlike the original utrace (since
     * tracepoint handlers aren't allowed to sleep).
     */

    return engine->ops == &utrace_detached_ops;
}
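
/*
* To make the return-value plumbing concrete, a hedged sketch of an
* engine callback (the name is hypothetical; the signature follows the
* report_quiesce hook as start_callback() below invokes it).  The low
* resume-action bits of the return value (see UTRACE_RESUME_MASK) are
* applied by finish_callback_report(); the remaining bits become
* report->result.  Callbacks run from tracepoint handlers here, so
* they must not sleep.
*/
#if 0
static u32 example_report_quiesce(u32 action, struct utrace_engine *engine,
                  unsigned long event)
{
    /* Tracepoint context: no sleeping, no blocking locks. */
    if (event & UTRACE_EVENT(DEATH))
        return UTRACE_DETACH;    /* ask to be detached and reaped */
    return UTRACE_RESUME;        /* leave the resume state alone */
}
#endif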

/*
* Start the callbacks for @engine to consider @event (a bit mask).
* This makes the report_quiesce() callback first.  If @engine wants
* a specific callback for @event, we return the ops vector to use.
* If not, we return NULL.  The return value from the ops->callback
* function called should be passed to finish_callback().
*/
static const struct utrace_engine_ops *start_callback(
    struct utrace *utrace, struct utrace_report *report,
    struct utrace_engine *engine, struct task_struct *task,
    unsigned long event)
{
    const struct utrace_engine_ops *ops;
    unsigned long want;

#ifdef STP_TF_DEBUG
    printk(KERN_ERR "%s:%d - utrace %p, report %p, engine %p, task %p, event %ld\n",
           __FUNCTION__, __LINE__, utrace, report, engine, task, event);
#endif

    /*
     * This barrier ensures that we've set utrace->reporting before
     * we examine engine->flags or engine->ops.  utrace_barrier()
     * relies on this ordering to indicate that the effect of any
     * utrace_control() and utrace_set_events() calls is in place
     * by the time utrace->reporting can be seen to be NULL.
     */
    utrace->reporting = engine;
    smp_mb();

    /*
     * This pairs with the barrier in mark_engine_detached().
     * It makes sure that we never see the old ops vector with
     * the new flags, in case the original vector had no report_quiesce.
     */
    want = engine->flags;
    smp_rmb();
    ops = engine->ops;

    if ((want & UTRACE_EVENT(QUIESCE)) || ops == &utrace_detached_ops) {
#ifdef STP_TF_DEBUG
        printk(KERN_ERR "%s:%d - quiescing, ops %p, ops->report_quiesce %p\n",
               __FUNCTION__, __LINE__, ops,
               (ops == NULL ? 0 : ops->report_quiesce));
#endif
        if (finish_callback(task, utrace, report, engine,
                    (*ops->report_quiesce)(report->action,
                               engine, event)))
            return NULL;

        if (!event) {
            /* We only got here to report QUIESCE */
            report->spurious = false;
            return NULL;
        }

        /*
         * finish_callback() reset utrace->reporting after the
         * quiesce callback.  Now we set it again (as above)
         * before re-examining engine->flags, which could have
         * been changed synchronously by ->report_quiesce or
         * asynchronously by utrace_control() or utrace_set_events().
         */
        utrace->reporting = engine;
        smp_mb();
        want = engine->flags;
    }

    if (want & ENGINE_STOP)
        report->action = UTRACE_STOP;

    if (want & event) {
        report->spurious = false;
        return ops;
    }

    utrace->reporting = NULL;
    return NULL;
}

/*
* Do a normal reporting pass for engines interested in @event.
* @callback is the name of the member in the ops vector, and remaining
* args are the extras it takes after the standard three args.
*/
#define REPORT_CALLBACKS(rev, task, utrace, report, event, callback, ...)     \
    do {                                      \
        struct utrace_engine *engine;                      \
        const struct utrace_engine_ops *ops;                  \
        list_for_each_entry##rev(engine, &utrace->attached, entry) {  \
            ops = start_callback(utrace, report, engine, task,    \
                         event);                  \
            if (!ops)                          \
                continue;                      \
            finish_callback(task, utrace, report, engine,          \
                    (*ops->callback)(__VA_ARGS__));          \
        }                                  \
    } while (0)
#define REPORT(task, utrace, report, event, callback, ...)              \
    do {                                      \
        start_report(utrace);                          \
        REPORT_CALLBACKS(, task, utrace, report, event, callback,     \
                 (report)->action, engine, ## __VA_ARGS__);   \
        finish_report(task, utrace, report, true);              \
    } while (0)
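
/*
* To make the macros concrete: for the EXEC report below, the REPORT()
* invocation expands roughly to the following fragment (a readability
* sketch only; fmt/bprm/regs stand in for the extra callback
* arguments).
*/
#if 0
start_report(utrace);
{
    struct utrace_engine *engine;
    const struct utrace_engine_ops *ops;

    list_for_each_entry(engine, &utrace->attached, entry) {
        ops = start_callback(utrace, &report, engine, task,
                     UTRACE_EVENT(EXEC));
        if (!ops)
            continue;
        finish_callback(task, utrace, &report, engine,
                (*ops->report_exec)(report.action, engine,
                            fmt, bprm, regs));
    }
}
finish_report(task, utrace, &report, true);
#endif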

/*
* Called iff UTRACE_EVENT(EXEC) flag is set.
*/
static void utrace_report_exec(void *cb_data __attribute__ ((unused)),
                   struct task_struct *task,
                   pid_t old_pid __attribute__((unused)),
                   struct linux_binprm *bprm __attribute__ ((unused)))
{
    struct utrace *utrace;

    if (atomic_read(&utrace_state) != __UTRACE_REGISTERED)
        return;
    utrace = task_utrace_struct(task);

    if (utrace && utrace->utrace_flags & UTRACE_EVENT(EXEC)) {
        INIT_REPORT(report);

        /* FIXME: Hmm, can we get regs another way? */
        REPORT(task, utrace, &report, UTRACE_EVENT(EXEC),
               report_exec, NULL, NULL, NULL /* regs */);
    }
}
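
/*
* These report functions are tracepoint probes rather than utrace
* hooks.  A hedged sketch of how they would be attached, assuming the
* STP_TRACE_REGISTER() helper from linux/stp_tracepoint.h (the real
* registration lives in this file's init path, and the function name
* here is hypothetical):
*/
#if 0
static int example_register_utrace_tracepoints(void)
{
    int rc = STP_TRACE_REGISTER(sched_process_exec, utrace_report_exec);
    if (rc == 0)
        rc = STP_TRACE_REGISTER(sys_enter, utrace_report_syscall_entry);
    if (rc == 0)
        rc = STP_TRACE_REGISTER(sys_exit, utrace_report_syscall_exit);
    if (rc == 0)
        rc = STP_TRACE_REGISTER(sched_process_fork, utrace_report_clone);
    if (rc == 0)
        rc = STP_TRACE_REGISTER(sched_process_exit, utrace_report_death);
    return rc;
}
#endif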

#if 0
static u32 do_report_syscall_entry(struct pt_regs *regs,
                   struct task_struct *task,
                   struct utrace *utrace,
                   struct utrace_report *report,
                   u32 resume_report)
{
    start_report(utrace);
    REPORT_CALLBACKS(_reverse, task, utrace, report,
             UTRACE_EVENT(SYSCALL_ENTRY), report_syscall_entry,
             resume_report | report->result | report->action,
             engine, regs);
    finish_report(task, utrace, report, false);

    if (report->action != UTRACE_STOP)
        return 0;

    utrace_stop(task, utrace, report->resume_action);

    if (fatal_signal_pending(task)) {
        /*
         * We are continuing despite UTRACE_STOP because of a
         * SIGKILL.  Don't let the system call actually proceed.
         */
        report->result = UTRACE_SYSCALL_ABORT;
    } else if (utrace->resume <= UTRACE_REPORT) {
        /*
         * If we've been asked for another report after our stop,
         * go back to report (and maybe stop) again before we run
         * the system call.  The second (and later) reports are
         * marked with the UTRACE_SYSCALL_RESUMED flag so that
         * engines know this is a second report at the same
         * entry.  This gives them the chance to examine the
         * registers anew after they might have been changed
         * while we were stopped.
         */
        report->detaches = false;
        report->spurious = true;
        report->action = report->resume_action = UTRACE_RESUME;
        return UTRACE_SYSCALL_RESUMED;
    }

    return 0;
}
#endif

/*
* Called iff UTRACE_EVENT(SYSCALL_ENTRY) flag is set.
* (In the original utrace this returned true to prevent the system
* call; this tracepoint-based port returns void and cannot abort the
* syscall here.)
*/
static void utrace_report_syscall_entry(void *cb_data __attribute__ ((unused)),
                    struct pt_regs *regs, long id)
{
    struct task_struct *task = current;
    struct utrace *utrace;

    if (atomic_read(&utrace_state) != __UTRACE_REGISTERED)
        return;
    utrace = task_utrace_struct(task);

    /* FIXME: Is this 100% correct? */
    if (utrace
        && utrace->utrace_flags & (UTRACE_EVENT(SYSCALL_ENTRY)|ENGINE_STOP)) {
        INIT_REPORT(report);

        /* FIXME: Hmm, original utrace called probes in reverse
         * order.  Needed here? */
        REPORT(task, utrace, &report, UTRACE_EVENT(SYSCALL_ENTRY),
               report_syscall_entry, regs);
    }

#if 0
    INIT_REPORT(report);
    u32 resume_report = 0;

    do {
        resume_report = do_report_syscall_entry(regs, task, utrace,
                            &report, resume_report);
    } while (resume_report);

    return utrace_syscall_action(report.result) == UTRACE_SYSCALL_ABORT;
#endif
}

/*
* Called iff UTRACE_EVENT(SYSCALL_EXIT) flag is set.
*/
static void utrace_report_syscall_exit(void *cb_data __attribute__ ((unused)),
                       struct pt_regs *regs, long ret)
{
    struct task_struct *task = current;
    struct utrace *utrace;

    if (atomic_read(&utrace_state) != __UTRACE_REGISTERED)
        return;
    utrace = task_utrace_struct(task);

    /* FIXME: Is this 100% correct? */
    if (utrace
        && utrace->utrace_flags & (UTRACE_EVENT(SYSCALL_EXIT)|ENGINE_STOP)) {
        INIT_REPORT(report);

#ifdef STP_TF_DEBUG
        printk(KERN_ERR "%s:%d - task %p, utrace %p, utrace_flags 0x%lx\n",
               __FUNCTION__, __LINE__, task, utrace,
               utrace->utrace_flags);
#endif
        REPORT(task, utrace, &report, UTRACE_EVENT(SYSCALL_EXIT),
               report_syscall_exit, regs);
    }
}

/*
* Called iff UTRACE_EVENT(CLONE) flag is set.
* This notification call blocks the wake_up_new_task call on the child.
* So we must not quiesce here.  tracehook_report_clone_complete will do
* a quiescence check momentarily.
*/
static void utrace_report_clone(void *cb_data __attribute__ ((unused)),
                struct task_struct *task,
                struct task_struct *child)
{
    struct utrace *utrace;

    if (atomic_read(&utrace_state) != __UTRACE_REGISTERED)
        return;
    utrace = task_utrace_struct(task);

#ifdef STP_TF_DEBUG
    printk(KERN_ERR "%s:%d - parent %p, child %p, current %p\n",
           __FUNCTION__, __LINE__, task, child, current);
#endif

    if (utrace && utrace->utrace_flags & UTRACE_EVENT(CLONE)) {
        unsigned long clone_flags = 0;
        INIT_REPORT(report);

        /* FIXME: Figure out what the clone_flags were. For
         * task_finder's purposes, all we need is CLONE_THREAD. */
        if (task->mm == child->mm)
            clone_flags |= CLONE_VM;
        if (task->fs == child->fs)
            clone_flags |= CLONE_FS;
        if (task->files == child->files)
            clone_flags |= CLONE_FILES;
        if (task->sighand == child->sighand)
            clone_flags |= CLONE_SIGHAND;

#if 0
#define CLONE_PTRACE    0x00002000    /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK    0x00004000    /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT    0x00008000    /* set if we want to have the same parent as the cloner */
#endif
        if (! thread_group_leader(child)) /* Same thread group? */
            clone_flags |= CLONE_THREAD;

#if 0
#define CLONE_NEWNS    0x00020000    /* New namespace group? */
#define CLONE_SYSVSEM    0x00040000    /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS    0x00080000    /* create a new TLS for the child */
#define CLONE_PARENT_SETTID    0x00100000    /* set the TID in the parent */
#define CLONE_CHILD_CLEARTID    0x00200000    /* clear the TID in the child */
#define CLONE_DETACHED        0x00400000    /* Unused, ignored */
#define CLONE_UNTRACED        0x00800000    /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID    0x01000000    /* set the TID in the child */
/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
   and is now available for re-use. */
#define CLONE_NEWUTS        0x04000000    /* New utsname group? */
#define CLONE_NEWIPC        0x08000000    /* New ipcs */
#define CLONE_NEWUSER        0x10000000    /* New user namespace */
#define CLONE_NEWPID        0x20000000    /* New pid namespace */
#define CLONE_NEWNET        0x40000000    /* New network namespace */
#define CLONE_IO        0x80000000    /* Clone io context */
#endif

        REPORT(task, utrace, &report, UTRACE_EVENT(CLONE),
               report_clone, clone_flags, child);

#if 0
        /*
         * For a vfork, we will go into an uninterruptible
         * block waiting for the child.  We need UTRACE_STOP
         * to happen before this, not after.  For CLONE_VFORK,
         * utrace_finish_vfork() will be called.
         */
        if (report.action == UTRACE_STOP
            && (clone_flags & CLONE_VFORK)) {
            spin_lock(&utrace->lock);
            utrace->vfork_stop = 1;
            spin_unlock(&utrace->lock);
        }
#endif
    }
}

/*
* We're called after utrace_report_clone() for a CLONE_VFORK.
* If UTRACE_STOP was left from the clone report, we stop here.
* After this, we'll enter the uninterruptible wait_for_completion()
* waiting for the child.
*/
static void utrace_finish_vfork(struct task_struct *task)
{
    struct utrace *utrace = task_utrace_struct(task);

    if (utrace->vfork_stop) {
        stp_spin_lock(&utrace->lock);
        utrace->vfork_stop = 0;
        stp_spin_unlock(&utrace->lock);
        utrace_stop(task, utrace, UTRACE_RESUME); /* XXX */
    }
}

/*
* Called iff UTRACE_EVENT(DEATH) or UTRACE_EVENT(QUIESCE) flag is set.
*
* It is always possible that we are racing with utrace_release_task here.
* For this reason, utrace_release_task checks for the event bits that get
* us here, and delays its cleanup for us to do.
*/
static void utrace_report_death(void *cb_data __attribute__ ((unused)),
                struct task_struct *task)
{
    struct utrace *utrace;
    INIT_REPORT(report);

    if (atomic_read(&utrace_state) != __UTRACE_REGISTERED)
        return;
    utrace = task_utrace_struct(task);

#ifdef STP_TF_DEBUG
    printk(KERN_ERR "%s:%d - task %p, utrace %p, flags %lx\n", __FUNCTION__, __LINE__, task, utrace, utrace ? utrace->utrace_flags : 0);
#endif
    if (!utrace || !(utrace->utrace_flags & UTRACE_EVENT(DEATH)))
        return;

    /* This code is called from the 'sched_process_exit'
     * tracepoint, which really corresponds more to UTRACE_EXIT
     * (thread exit in progress) than to UTRACE_DEATH (thread has
     * died).  But utrace_report_death() calls
     * utrace_maybe_reap(), which does cleanup that we need.
     *
     * Because of this, 'exit_state' won't be set yet (as it would
     * have been when the original utrace hit this code).
     *
     * BUG_ON(!task->exit_state);
     */

    /*
     * We are presently considered "quiescent"--which is accurate
     * inasmuch as we won't run any more user instructions ever again.
     * But for utrace_control and utrace_set_events to be robust, they
     * must be sure whether or not we will run any more callbacks.  If
     * a call comes in before we do, taking the lock here synchronizes
     * us so we don't run any callbacks just disabled.  Calls that come
     * in while we're running the callbacks will see the exit.death
     * flag and know that we are not yet fully quiescent for purposes
     * of detach bookkeeping.
     */
    if (in_atomic() || irqs_disabled()) {
        if (! utrace->report_work_added) {
            int rc;
#ifdef STP_TF_DEBUG
            printk(KERN_ERR "%s:%d - adding task_work\n",
                   __FUNCTION__, __LINE__);
#endif
            rc = stp_task_work_add(task,
                           &utrace->report_work);
            if (rc == 0) {
                utrace->report_work_added = 1;
            }
            /* stp_task_work_add() returns -ESRCH if the
             * task has already passed
             * exit_task_work(). Just ignore this
             * error. */
            else if (rc != -ESRCH) {
                printk(KERN_ERR
                       "%s:%d - task_work_add() returned %d\n",
                       __FUNCTION__, __LINE__, rc);
            }
        }
    }
    else {
        stp_spin_lock(&utrace->lock);
        BUG_ON(utrace->death);
        utrace->death = 1;
        utrace->resume = UTRACE_RESUME;
        splice_attaching(utrace);
        stp_spin_unlock(&utrace->lock);

        REPORT_CALLBACKS(, task, utrace, &report, UTRACE_EVENT(DEATH),
                 report_death, engine, -1/*group_dead*/,
                 -1/*signal*/);

        utrace_maybe_reap(task, utrace, false);
        utrace_free(utrace);
    }
}

/*
* Finish the last reporting pass before returning to user mode.
*/
static void finish_resume_report(struct task_struct *task,
                 struct utrace *utrace,
                 struct utrace_report *report)
{
    finish_report_reset(task, utrace, report);

    switch (report->action) {
    case UTRACE_STOP:
        utrace_stop(task, utrace, report->resume_action);
        break;

    case UTRACE_INTERRUPT:
        if (!signal_pending(task)) {
            stp_task_notify_resume(task, utrace);
            set_tsk_thread_flag(task, TIF_SIGPENDING);
        }
        break;

    case UTRACE_REPORT:
    case UTRACE_RESUME:
    default:
        break;
    }
}

/*
* This is called when TIF_NOTIFY_RESUME had been set (and is now clear).
* We are close to user mode, and this is the place to report or stop.
* When we return, we're going to user mode or into the signals code.
*/
static void utrace_resume(struct task_work *work)
{
    /*
     * We could also do 'task_utrace_struct()' here to find the
     * task's 'struct utrace', but 'container_of()' should be
     * instantaneous (where 'task_utrace_struct()' has to do a
     * hash lookup).
     */
    struct utrace *utrace = container_of(work, struct utrace, work);
    struct task_struct *task = current;
    INIT_REPORT(report);
    struct utrace_engine *engine;

    might_sleep();
    utrace->task_work_added = 0;

    /* Make sure the task isn't exiting. */
    if (task->flags & PF_EXITING) {
        /* Remember that this task_work_func is finished. */
        stp_task_work_func_done();
        return;
    }

    /*
     * Some machines get here with interrupts disabled.  The same arch
     * code path leads to calling into get_signal_to_deliver(), which
     * implicitly reenables them by virtue of spin_unlock_irq.
     */
    local_irq_enable();

    /*
     * Update our bookkeeping even if there are no callbacks made here.
     */
    report.action = start_report(utrace);

    switch (report.action) {
    case UTRACE_RESUME:
        /*
         * Anything we might have done was already handled by
         * utrace_get_signal(), or this is an entirely spurious
         * call.  (The arch might use TIF_NOTIFY_RESUME for other
         * purposes as well as calling us.)
         */

        /* Remember that this task_work_func is finished. */
        stp_task_work_func_done();
        return;
    case UTRACE_INTERRUPT:
        /*
         * Note that UTRACE_INTERRUPT reporting was handled by
         * utrace_get_signal() in original utrace. In this
         * utrace version, we'll handle it here like UTRACE_REPORT.
         *
         * Fallthrough...
         */
    case UTRACE_REPORT:
        if (unlikely(!(utrace->utrace_flags & UTRACE_EVENT(QUIESCE))))
            break;
        /*
         * Do a simple reporting pass, with no specific
         * callback after report_quiesce.
         */
        report.action = UTRACE_RESUME;
        list_for_each_entry(engine, &utrace->attached, entry)
            start_callback(utrace, &report, engine, task, 0);
        break;
    default:
        /*
         * Even if this report was truly spurious, there is no need
         * for utrace_reset() now.  TIF_NOTIFY_RESUME was already
         * cleared--it doesn't stay spuriously set.
         */
        report.spurious = false;
        break;
    }

    /*
     * Finish the report and either stop or get ready to resume.
     * If utrace->resume was not UTRACE_REPORT, this applies its
     * effect now (i.e. step or interrupt).
     */
    finish_resume_report(task, utrace, &report);

    /* Remember that this task_work_func is finished. */
    stp_task_work_func_done();
}


static void utrace_report_work(struct task_work *work)
{
    /*
     * We could also do 'task_utrace_struct()' here to find the
     * task's 'struct utrace', but 'container_of()' should be
     * instantaneous (where 'task_utrace_struct()' has to do a
     * hash lookup).
     */
    struct utrace *utrace = container_of(work, struct utrace, report_work);
    struct task_struct *task = current;
    INIT_REPORT(report);

#ifdef STP_TF_DEBUG
    printk(KERN_ERR "%s:%d - atomic %d, irqs_disabled %d\n",
           __FUNCTION__, __LINE__, in_atomic(), irqs_disabled());
#endif
    might_sleep();
    utrace->report_work_added = 0;

    stp_spin_lock(&utrace->lock);
    BUG_ON(utrace->death);
    utrace->death = 1;
    utrace->resume = UTRACE_RESUME;
    splice_attaching(utrace);
    stp_spin_unlock(&utrace->lock);

    REPORT_CALLBACKS(, task, utrace, &report, UTRACE_EVENT(DEATH),
             report_death, engine, -1/*group_dead*/,
             -1/*signal*/);

    utrace_maybe_reap(task, utrace, false);
    utrace_free(utrace);

    /* Remember that this task_work_func is finished. */
    stp_task_work_func_done();
}
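
/*
* Both task_work handlers above recover their struct utrace via
* container_of(), which presumes each work item was initialized
* against the matching handler when the struct utrace was set up
* (presumably where it is allocated).  A hedged sketch of that
* pairing, assuming init_task_work() from this kernel's task_work
* API (the wrapper name is hypothetical):
*/
#if 0
static void example_init_utrace_work(struct utrace *utrace)
{
    init_task_work(&utrace->work, &utrace_resume);
    init_task_work(&utrace->report_work, &utrace_report_work);
}
#endif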

#endif    /* _STP_UTRACE_C */