runtime/linux/perf.c - systemtap

Data types defined

Functions defined

Macros defined

Source code

/* -*- linux-c -*-
* Perf Functions
* Copyright (C) 2006-2014 Red Hat Inc.
*
* This file is part of systemtap, and is free software.  You can
* redistribute it and/or modify it under the terms of the GNU General
* Public License (GPL); either version 2, or (at your option) any
* later version.
*/

#ifndef _PERF_C_
#define _PERF_C_

#include <linux/perf_event.h>
#include <linux/workqueue.h>

#include "perf.h"

#ifndef INIT_WORK_ONSTACK
#define INIT_WORK_ONSTACK(_work, _func) INIT_WORK((_work), (_func))
#define destroy_work_on_stack(_work) do { (void)(_work); } while (0)
#endif

/** @file perf.c
* @brief Implements performance monitoring hardware support
*/

/** Initialize performance sampling
* Call this during probe initialization to set up performance event sampling
* for all online cpus.  Returns non-zero on error.
*
* @param stp Handle for the event to be registered.
*/
static long _stp_perf_init (struct stap_perf_probe *stp, struct task_struct* task)
{
    int cpu;

    if (!stp->system_wide) {
      if (task == 0) /* need to setup later when we know the task */
        return 0;
      else  {
        if (stp->e.t.per_thread_event != 0) /* already setup */
          return 0;
        stp->e.t.per_thread_event = perf_event_create_kernel_counter(&stp->attr,
                                     -1,
#if defined(STAPCONF_PERF_STRUCTPID) || defined (STAPCONF_PERF_COUNTER_CONTEXT)
                                     task,
#else
                                     task->pid,
#endif
                                     stp->callback
#ifdef STAPCONF_PERF_COUNTER_CONTEXT
                                     , NULL
#endif
                                     );
        if (IS_ERR(stp->e.t.per_thread_event)) {
          long rc = PTR_ERR(stp->e.t.per_thread_event);
          stp->e.t.per_thread_event = NULL;

          /*
           * PPC returns ENXIO for HW counters until 2.6.37
           * (behavior changed with commit b0a873e).
           */
          if (rc == -EINVAL || rc == -ENOSYS || rc == -ENOENT
          || rc == -EOPNOTSUPP || rc == -ENXIO) {
            _stp_warn("perf probe '%s' is not supported by this kernel (%ld).",
#ifdef STP_NEED_PROBE_NAME
              stp->probe->pn,
#else
              stp->probe->pp,
#endif
              rc);
        /* Lie and return 0. This way the more generic
         * task_finder warning won't be printed. */
        rc = 0;
          }
          return rc;
        }
      }
    }
    else {
      /* allocate space for the event descriptor for each cpu */
      stp->e.events = _stp_alloc_percpu (sizeof(struct perf_event*));
      if (stp->e.events == NULL) {
        return -ENOMEM;
      }

      /* initialize event on each processor */
      for_each_possible_cpu(cpu) {
        struct perf_event **event = per_cpu_ptr (stp->e.events, cpu);
        if (cpu_is_offline(cpu)) {
          *event = NULL;
          continue;
        }
        *event = perf_event_create_kernel_counter(&stp->attr,
                              cpu,
#if defined(STAPCONF_PERF_STRUCTPID) || defined (STAPCONF_PERF_COUNTER_CONTEXT)
                              NULL,
#else
                              -1,
#endif
                              stp->callback
#ifdef STAPCONF_PERF_COUNTER_CONTEXT
                              , NULL
#endif
                              );

        if (IS_ERR(*event)) {
          long rc = PTR_ERR(*event);
          *event = NULL;
          _stp_perf_del(stp);
          return rc;
        }
      }
    } /* (stp->system_wide) */
    return 0;
}

/** Delete performance event.
* Call this to shutdown one performance event sampling
*
* @param stp Handle for the event to be unregistered.
*/
static void _stp_perf_del (struct stap_perf_probe *stp)
{
  int cpu;
  if (! stp || !stp->e.events)
    return;

  /* shut down performance event sampling */
  if (stp->system_wide) {
    for_each_possible_cpu(cpu) {
      struct perf_event **event = per_cpu_ptr (stp->e.events, cpu);
      if (*event) {
    perf_event_release_kernel(*event);
      }
    }
    _stp_free_percpu (stp->e.events);
    stp->e.events = NULL;
  }
  else {
    if (stp->e.t.per_thread_event) {
      perf_event_release_kernel(stp->e.t.per_thread_event);
    }
    stp->e.t.per_thread_event = NULL;
  }
}

/** Delete many performance events in reverse order.
* Call this to shutdown all performance event sampling
*
* @param probes A pointer array for the events to be unregistered.
* @param n The number of events in the array.
*/
static void _stp_perf_del_n (struct stap_perf_probe *probes, size_t n)
{
  while (n--)
    _stp_perf_del(&probes[n]);
}

struct _stp_perf_work {
  struct work_struct work;
  struct stap_perf_probe *probes;
  size_t nprobes;
  const char* probe_point;
  int rc;
};

/** Initialize many performance events from a workqueue
* Even though we're using the kernel interface, perf checks CAP_SYS_ADMIN,
* which our mere @stapdev user may not have.  By running via a workqueue,
* we'll be in an events/X kernel thread with sufficient privileges.
*
* @param work The _stp_perf_work encapsulating _stp_perf_init_n parameters.
*/
static void _stp_perf_init_work (struct work_struct *work)
{
  size_t i;
  struct _stp_perf_work *pwork =
    container_of(work, struct _stp_perf_work, work);

  for (i = 0; i < pwork->nprobes; ++i) {
    struct stap_perf_probe* stp = &pwork->probes[i];

    if (stp->system_wide)
      pwork->rc = _stp_perf_init(stp, NULL);
    else if (stp->task_finder)
#ifdef STP_PERF_USE_TASK_FINDER
      pwork->rc = stap_register_task_finder_target(&stp->e.t.tgt);
#else
      pwork->rc = EINVAL;
#endif

    if (pwork->rc) {
      pwork->probe_point = stp->probe->pp;
      _stp_perf_del_n(pwork->probes, i);
      break;
    }
  }
}

/** Initialize many performance events
* Call this to start all performance event sampling
*
* @param probes A pointer array for the events to be registered.
* @param n The number of events in the array.
* @param ppfail A pointer to return the probe_point on failure.
*/
static int _stp_perf_init_n (struct stap_perf_probe *probes, size_t n,
                 const char **ppfail)
{
  struct _stp_perf_work pwork = { .probes = probes, .nprobes = n };
  INIT_WORK_ONSTACK(&pwork.work, _stp_perf_init_work);
  schedule_work(&pwork.work);
  flush_work(&pwork.work);
  if (pwork.rc)
    *ppfail = pwork.probe_point;
  destroy_work_on_stack(&pwork.work);
  return pwork.rc;
}


/*
The first call to _stp_perf_init, via systemtap_module_init at runtime, is for
setting up aggregate counters.  Per thread counters need to be setup when the
thread is known.  This is done by calling _stp_perf_init later when the thread
is known.  A per thread perf counter is defined by a counter("var") suffix on
the perf probe.  It is defined by perf_builder.  This counter is read on demand
via the "@perf("var")" builtin which is treated as an expression right hand side
which reads the perf counter associated with the previously defined perf
counter.  It is expanded by dwarf_var_expanding_visitor
*/

static int _stp_perf_read_init (unsigned i, struct task_struct* task)
{
  /* Choose the stap_perf_probes entry */
  struct stap_perf_probe* stp = & stap_perf_probes[i];

  return _stp_perf_init (stp, task);
}


long _stp_perf_read (int ncpu, unsigned i)
{
  /* Choose the stap_perf_probes entry */
  struct stap_perf_probe* stp;
  u64 enabled, running;

  if (i > sizeof(stap_perf_probes)/sizeof(struct stap_perf_probe))
    {
      _stp_error ("_stp_perf_read\n");
      return 0;
    }
  stp = & stap_perf_probes[i];

  if (stp == NULL || stp->e.t.per_thread_event == NULL)
    {
      _stp_error ("_stp_perf_read\n");
      return 0;
    }

  might_sleep();
  return perf_event_read_value (stp->e.t.per_thread_event, &enabled, &running);

}


#endif /* _PERF_C_ */