/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

//#define DEBUGTAG "Signal"

#include "record_signal.h"

#include <assert.h>
#include <fcntl.h>
#include <linux/perf_event.h>
#include <sched.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <sys/mman.h>
#include <sys/user.h>
#include <x86intrin.h>

#include "preload/preload_interface.h"

#include "AutoRemoteSyscalls.h"
#include "Flags.h"
#include "kernel_metadata.h"
#include "log.h"
#include "PerfCounters.h"
#include "RecordSession.h"
#include "task.h"
#include "TraceStream.h"
#include "util.h"

using namespace rr;
using namespace std;

/** Read the tracer CPU's time-stamp counter through the __rdtsc() intrinsic. */
static inline unsigned long long rdtsc(void) {
  const unsigned long long tsc = __rdtsc();
  return tsc;
}

/* Stop-signal value compared against Task::stop_sig() after a
 * cont_syscall() to recognise that the tracee stopped at a syscall
 * boundary: SIGTRAP with bit 0x80 set.
 * NOTE(review): this is the form ptrace reports when
 * PTRACE_O_TRACESYSGOOD is enabled -- confirm the option is set where
 * the tracee is attached. */
static const int STOPSIG_SYSCALL = SIGTRAP | 0x80;

/**
 * Per-architecture worker for sigaction_sigset_size(): yields the
 * |sigaction_sigset_size| constant declared by |Arch|.
 */
template <typename Arch> static size_t sigaction_sigset_size_arch() {
  const size_t sigset_size = Arch::sigaction_sigset_size;
  return sigset_size;
}

/**
 * Runtime-|arch| front end for sigaction_sigset_size_arch().  The
 * result is what restore_sigsegv_state() passes as the last argument
 * of the rt_sigaction syscall it performs in the tracee.
 */
static size_t sigaction_sigset_size(SupportedArch arch) {
  // NOTE(review): RR_ARCH_FUNCTION is defined elsewhere; presumably it
  // selects sigaction_sigset_size_arch<Arch>() for |arch| and returns
  // its result -- confirm against the macro definition.
  RR_ARCH_FUNCTION(sigaction_sigset_size_arch, arch);
}

/**
 * Re-install, inside tracee |t|, the SIGSEGV sigaction that |t| keeps
 * in its local copy.  Asserts that the remote rt_sigaction call
 * returns 0.  Only the sigaction is restored; see the note at the end
 * of the function regarding blocked-ness.
 */
static void restore_sigsegv_state(Task* t) {
  const vector<uint8_t>& saved_action = t->signal_action(SIGSEGV);
  AutoRemoteSyscalls remote(t);
  {
    // NOTE(review): AutoRestoreMem presumably stages the sigaction
    // bytes in tracee memory and undoes that when this scope ends --
    // confirm against its definition.  The scope is kept tight so it
    // is torn down before |remote| is.
    AutoRestoreMem remote_action(remote, saved_action.data(),
                                 saved_action.size());
    const auto arch = remote.arch();
    int err = remote.syscall(syscall_number_for_rt_sigaction(arch), SIGSEGV,
                             remote_action.get().as_int(), nullptr,
                             sigaction_sigset_size(arch));
    ASSERT(t, 0 == err) << "Failed to restore SIGSEGV handler";
  }
  // NB: ordinarily the SIG_BLOCK state for SIGSEGV would be restored
  // here as well, but doing that has no effect on the kernel's
  // "SigBlk" mask.  No misbehaviour has been observed in the kernel's
  // delivery of SIGSEGV after the RDTSC trap, so nothing more is done.
}

/** Byte encoding of the RDTSC instruction, as matched by is_ip_rdtsc(). */
static const uint8_t rdtsc_insn[] = { 0x0f, 0x31 };

/**
 * Return true iff the bytes at |t->ip()| are a RDTSC instruction.
 * Returns false when those bytes can't be read in full.
 */
static bool is_ip_rdtsc(Task* t) {
  uint8_t fetched[sizeof(rdtsc_insn)];
  const auto nread = t->read_bytes_fallible(t->ip(), sizeof(fetched), fetched);
  if (nread != sizeof(fetched)) {
    // Short or failed read: nothing to compare against.
    return false;
  }
  return 0 == memcmp(fetched, rdtsc_insn, sizeof(fetched));
}

/**
 * Emulate a trapped RDTSC on behalf of |t|.
 *
 * Returns true if |si| is a SIGSEGV and |t->ip()| points at a RDTSC
 * instruction; in that case |t|'s registers are updated with the
 * tracer's current TSC value, its ip is moved past the instruction and
 * an EV_SEGV_RDTSC event is pushed.  Returns false, leaving |t|
 * untouched, otherwise.  |si| must not describe a SIGTRAP.
 */
static bool try_handle_rdtsc(Task* t, siginfo_t* si) {
  const int signo = si->si_signo;
  assert(signo != SIGTRAP);

  if (SIGSEGV != signo) {
    return false;
  }
  if (!is_ip_rdtsc(t)) {
    return false;
  }

  // Hand the tracee our own TSC reading and step over the instruction.
  const unsigned long long tsc = rdtsc();
  Registers regs = t->regs();
  regs.set_rdtsc_output(tsc);
  regs.set_ip(regs.ip() + sizeof(rdtsc_insn));
  t->set_regs(regs);

  // With SIGSEGV blocked, the kernel apparently has to play tricks
  // to raise the RDTSC trap: the SIGSEGV bit of "SigBlk" in
  // /proc/status shows up cleared, and if a user handler is
  // installed the SIGSEGV bit of "SigCgt" is cleared as well.
  // That in itself is fine, but it is unclear who is meant to undo
  // the change to the signal state.  It can reasonably be argued
  // that the tracer is responsible, so the old state is restored
  // here.
  //
  // It can equally be argued that this is a kernel bug, in which
  // case this is a workaround that may be dropped some day.
  //
  // Without the restore, firefox (at least) has been seen to hang
  // when SIGSEGV is delivered.  The test written for this bug,
  // fault_in_code_addr, does not hang without the restore, though.
  if (t->is_sig_blocked(SIGSEGV)) {
    restore_sigsegv_state(t);
  }

  t->push_event(Event(EV_SEGV_RDTSC, HAS_EXEC_INFO, t->arch()));
  LOG(debug) << "  trapped for rdtsc: returning " << tsc;
  return true;
}

/**
 * Issue PERF_EVENT_IOC_DISABLE on |t->desched_fd|.  Any non-zero
 * ioctl result is treated as fatal.
 */
void disarm_desched_event(Task* t) {
  const int rc = ioctl(t->desched_fd, PERF_EVENT_IOC_DISABLE, 0);
  if (0 != rc) {
    FATAL() << "Failed to disarm desched event";
  }
}

/**
 * Issue PERF_EVENT_IOC_ENABLE on |t->desched_fd|.  Any non-zero
 * ioctl result is treated as fatal.
 */
void arm_desched_event(Task* t) {
  if (ioctl(t->desched_fd, PERF_EVENT_IOC_ENABLE, 0)) {
    // This is the *arm* path; the message must not claim a disarm
    // failure, or the log points at the wrong call.
    FATAL() << "Failed to arm desched event";
  }
}

/**
 * Handle a SYSCALLBUF_DESCHED_SIGNAL stop of |t| described by |si|.
 *
 * Nothing is returned; instead, the event that needs to be processed
 * after this desched is left on |t|'s event stack: either a no-op
 * event, or a desched event followed by an interrupted-syscall event.
 * If a desched is already being processed for |t|, no new event is
 * pushed and the current syscall event is updated in place.
 *
 * The tracee's execution may be advanced (via cont_syscall()) while
 * figuring out which case applies.
 */
static void handle_desched_event(Task* t, const siginfo_t* si) {
  ASSERT(t, (SYSCALLBUF_DESCHED_SIGNAL == si->si_signo &&
             si->si_code == POLL_IN && si->si_fd == t->desched_fd_child))
      << "Tracee is using SIGSYS??? (code=" << si->si_code
      << ", fd=" << si->si_fd << ")";

  /* If the tracee isn't in the critical section where a desched
   * event is relevant, we can ignore it.  See the long comments
   * in syscall_buffer.c.
   *
   * It's OK if the tracee is in the critical section for a
   * may-block syscall B, but this signal was delivered by an
   * event programmed by a previous may-block syscall A. */
  if (!t->syscallbuf_hdr->desched_signal_may_be_relevant) {
    LOG(debug) << "  (not entering may-block syscall; resuming)";
    /* We have to disarm the event just in case the tracee
     * has cleared the relevancy flag, but not yet
     * disarmed the event itself. */
    disarm_desched_event(t);
    t->push_event(Event::noop(t->arch()));
    return;
  }

  /* TODO: how can signals interrupt us here? */

  /* The desched event just fired.  That implies that the
   * arm-desched ioctl went into effect, and that the
   * disarm-desched syscall didn't take effect.  Since a signal
   * is pending for the tracee, then if the tracee was in a
   * syscall, linux has exited it with an -ERESTART* error code.
   * That means the tracee is about to (re-)enter either
   *
   *  1. buffered syscall
   *  2. disarm-desched ioctl syscall
   *
   * We can figure out which one by simply issuing a
   * ptrace(SYSCALL) and examining the tracee's registers.
   *
   * If the tracee enters the disarm-desched ioctl, it's going
   * to commit a record of the buffered syscall to the
   * syscallbuf, and we can safely send the tracee back on its
   * way, ignoring the desched completely.
   *
   * If it enters the buffered syscall, then the desched event
   * has served its purpose and we need to prepare the tracee to
   * be context-switched.
   *
   * An annoyance of the desched signal is that when the tracer
   * is descheduled in interval (C) above, we normally (see
   * below) see *two* signals.  The current theory of what's
   * happening is
   *
   *  o child gets descheduled, bumps counter to i and schedules
   *    signal
   *  o signal notification "schedules" child, but it doesn't
   *    actually run any application code
   *  o child is being ptraced, so we "deschedule" child to
   *    notify parent and bump counter to i+1.  (The parent
   *    hasn't had a chance to clear the counter yet.)
   *  o another counter signal is generated, but signal is
   *    already pending so this one is queued
   *  o parent is notified and sees counter value i+1
   *  o parent stops delivery of first signal and disarms
   *    counter
   *  o second signal dequeued and delivered, notifying parent
   *    (counter is disarmed now, so no pseudo-desched possible
   *    here)
   *  o parent notified and sees counter value i+1 again
   *  o parent stops delivery of second signal and we continue on
   *
   * So we "work around" this by the tracer expecting two signal
   * notifications, and silently discarding both.
   *
   * One really fun edge case is that sometimes the desched
   * signal will interrupt the arm-desched syscall itself.
   * Continuing to the next syscall boundary seems to restart
   * the arm-desched syscall, and advancing to the boundary
   * again exits it and we start receiving desched signals
   * again.
   *
   * That may be a kernel bug, but we handle it by just
   * continuing until we continue past the arm-desched
   * syscall *and* stop seeing signals. */
  while (true) {
    // Prevent further desched notifications from firing
    // while we're advancing the tracee.  We're going to
    // leave it in a consistent state anyway, so the event
    // is no longer useful.  We have to do this in each
    // loop iteration because a restarted arm-desched
    // syscall may have re-armed the event.
    disarm_desched_event(t);

    t->cont_syscall();
    int sig = t->stop_sig();

    // A syscall stop ends the loop unless it is the arm-desched
    // syscall itself, in which case we keep advancing.
    if (STOPSIG_SYSCALL == sig) {
      if (t->is_arm_desched_event_syscall()) {
        continue;
      }
      break;
    }
    // Completely ignore spurious desched signals and
    // signals that aren't going to be delivered to the
    // tracee.
    //
    // Also ignore time-slice signals.  If the tracee ends
    // up at the disarm-desched ioctl, we'll reschedule it
    // with the ticks interrupt still programmed.  At worst,
    // the tracee will get an extra time-slice out of
    // this, on average, so we don't worry too much about
    // it.
    //
    // TODO: it's theoretically possible for this to
    // happen an unbounded number of consecutive times
    // and the tracee never switched out.
    if (SYSCALLBUF_DESCHED_SIGNAL == sig ||
        PerfCounters::TIME_SLICE_SIGNAL == sig || t->is_sig_ignored(sig)) {
      LOG(debug) << "  dropping ignored " << signal_name(sig);
      continue;
    }

    // Any other signal is stashed on |t| rather than handled here.
    LOG(debug) << "  stashing " << signal_name(sig);
    t->stash_sig();
  }

  // Case 2 from the comment above: the tracee reached the
  // disarm-desched ioctl, so the desched can be ignored.
  if (t->is_disarm_desched_event_syscall()) {
    LOG(debug)
        << "  (at disarm-desched, so finished buffered syscall; resuming)";
    t->push_event(Event::noop(t->arch()));
    return;
  }

  if (t->desched_rec()) {
    // We're already processing a desched. We probably reexecuted the
    // system call (e.g. because a signal was processed) and the syscall
    // blocked again. Carry on with the current desched.
  } else {
    /* This prevents the syscallbuf record counter from being
     * reset until we've finished guiding the tracee through this
     * interrupted call.  We use the record counter for
     * assertions. */
    t->delay_syscallbuf_reset = true;

    /* The tracee is (re-)entering the buffered syscall.  Stash
     * away this breadcrumb so that we can figure out what syscall
     * the tracee was in, and how much "scratch" space it carved
     * off the syscallbuf, if needed. */
    const struct syscallbuf_record* desched_rec =
        next_record(t->syscallbuf_hdr);
    t->push_event(DeschedEvent(desched_rec, t->arch()));
    int call = t->desched_rec()->syscallno;
    /* Replay needs to be prepared to see the ioctl() that arms
     * the desched counter when it's trying to step to the entry
     * of |call|.  We'll record the syscall entry when the main
     * recorder code sees the tracee's syscall event. */
    t->record_current_event();

    /* Because we set the |delay_syscallbuf_reset| flag and the
     * record counter will stay intact for a bit, we need to also
     * prevent later events from flushing the syscallbuf until
     * we've unblocked the reset. */
    t->delay_syscallbuf_flush = true;

    /* The descheduled syscall was interrupted by a signal, like
     * all other may-restart syscalls, with the exception that
     * this one has already been restarted (which we'll detect
     * back in the main loop). */
    t->push_event(Event(interrupted, SyscallEvent(call, t->arch())));
    SyscallEvent& ev = t->ev().Syscall();
    ev.desched_rec = desched_rec;
  }

  // In both branches above, the event on top of |t|'s stack is now
  // the syscall event for the descheduled call; refresh its register
  // snapshot and state.
  SyscallEvent& ev = t->ev().Syscall();
  ev.regs = t->regs();
  /* For some syscalls (at least poll) but not all (at least not read),
   * repeated cont_syscall()s above of the same interrupted syscall
   * can set $orig_eax to 0 ... for unclear reasons. Fix that up here
   * otherwise we'll get a divergence during replay, which will not
   * encounter this problem.
   */
  int call = t->desched_rec()->syscallno;
  ev.regs.set_original_syscallno(call);
  t->set_regs(ev.regs);
  ev.state = EXITING_SYSCALL;

  LOG(debug) << "  resuming (and probably switching out) blocked `"
             << t->syscall_name(call) << "'";
}

/**
 * Push a SignalEvent for |si| onto |t|, tagged with the result of
 * is_deterministic_signal(si) and |t|'s architecture.
 */
static void record_signal(Task* t, const siginfo_t& si) {
  const auto deterministic = is_deterministic_signal(si);
  t->push_event(SignalEvent(si, deterministic, t->arch()));
}

/**
 * Return true if a signal may be delivered to |t| in its current
 * state.  Otherwise set |notify_on_syscall_hook_exit| in |t|'s
 * syscallbuf header and return false.
 */
static bool is_safe_to_deliver_signal(Task* t) {
  struct syscallbuf_hdr* const hdr = t->syscallbuf_hdr;

  /* Two cases that are always fine:
   *  - there is no syscallbuf header yet, so the tracee can't be in
   *    a critical section: the lock doesn't exist;
   *  - the tracee is outside the syscallbuf code, so in most cases
   *    it can't affect syscallbuf critical sections.  The exception
   *    is signal handlers "re-entering" desched'd syscalls, and
   *    those are OK. */
  if (!hdr || !t->is_in_syscallbuf()) {
    return true;
  }

  if (t->is_in_traced_syscall()) {
    LOG(debug) << "  tracee at traced syscallbuf syscall";
    return true;
  }

  const bool desched_in_progress =
      t->is_in_untraced_syscall() && t->desched_rec();
  if (desched_in_progress) {
    LOG(debug) << "  tracee interrupted by desched of "
               << t->syscall_name(t->desched_rec()->syscallno);
    return true;
  }

  // Not safe right now.  The flag set here is cleared again by our
  // emulation of SYS_rrcall_notify_syscall_hook_exit.
  hdr->notify_on_syscall_hook_exit = true;
  return false;
}

/**
 * Decide what to do about the signal |si| that stopped |t|.
 *
 * Returns
 *  - SIGNAL_HANDLED when an event for the signal was pushed onto |t|
 *    (desched, emulated RDTSC, time-slice expiry, or an ordinary
 *    signal event);
 *  - DEFER_SIGNAL when is_safe_to_deliver_signal() reports that the
 *    signal can't be delivered yet;
 *  - SIGNAL_PTRACE_STOP when emulate_ptrace_stop() returned true, in
 *    which case no signal event is recorded here.
 */
SignalHandled handle_signal(Task* t, siginfo_t* si) {
  // The closing ")" balances the "(pevent: " opened in this message.
  LOG(debug) << t->tid << ": handling signal " << signal_name(si->si_signo)
             << " (pevent: " << t->ptrace_event() << ", event: " << t->ev()
             << ")";

  /* We have to check for a desched event first, because for
   * those we *do not* want to (and cannot, most of the time)
   * step the tracee out of the syscallbuf code before
   * attempting to deliver the signal. */
  if (SYSCALLBUF_DESCHED_SIGNAL == si->si_signo) {
    handle_desched_event(t, si);
    return SIGNAL_HANDLED;
  }

  if (!is_safe_to_deliver_signal(t)) {
    return DEFER_SIGNAL;
  }

  t->set_siginfo_for_synthetic_SIGCHLD(si);

  /* See if this signal occurred because of an rr implementation detail,
   * and fudge t appropriately. */
  switch (si->si_signo) {
    case SIGSEGV:
      if (try_handle_rdtsc(t, si)) {
        return SIGNAL_HANDLED;
      }
      break;

    case PerfCounters::TIME_SLICE_SIGNAL:
      t->push_event(Event(EV_SCHED, HAS_EXEC_INFO, t->arch()));
      return SIGNAL_HANDLED;

    default:
      /* Not an rr-internal signal; fall through to normal recording. */
      break;
  }

  /* This signal was generated by the program or an external
   * source, record it normally. */

  // The first argument is a wait()-style status word: stop signal in
  // bits 8-15 and 0x7f in the low byte, i.e. "stopped by si_signo".
  if (t->emulate_ptrace_stop((si->si_signo << 8) | 0x7f,
                             SIGNAL_DELIVERY_STOP)) {
    // ptracer has been notified, so don't deliver the signal now.
    // The signal won't be delivered for real until the ptracer calls
    // PTRACE_CONT with the signal number (which we don't support yet!).
    return SIGNAL_PTRACE_STOP;
  }

  record_signal(t, *si);
  return SIGNAL_HANDLED;
}
