
/*--------------------------------------------------------------------*/
/*--- Callgrind                                                    ---*/
/*---                                                    ct_main.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind skin for call graph
   profiling programs.

   Copyright (C) 2002-2004, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This skin is derived from and contains a lot of code from Cachegrind
   Copyright (C) 2002 Nicholas Nethercote (njn25@cam.ac.uk)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "vg_skin.h"
#include "calltree.h"
//#include "vg_profile.c"

/* Emit the interface version declaration for this skin.
 * For core interface major versions above 4 the macro is invoked with the
 * pre_clo_init entry point (declared here first) and a second argument;
 * for older cores it is used without arguments. */
#if VG_CORE_INTERFACE_MAJOR_VERSION > 4
void SK_(pre_clo_init)(void);
VG_DETERMINE_INTERFACE_VERSION(SK_(pre_clo_init), 0)
#else
VG_DETERMINE_INTERFACE_VERSION
#endif

/* Cache description used for cache simulation.
 * Declared before ct_sim.c is included further down. */
typedef struct {
    int size;       /* bytes */ 
    int assoc;      /* NOTE(review): presumably the associativity - confirm in ct_sim.c */
    int line_size;  /* bytes */ 
} cache_t;

#include "ct_sim.c"


/*------------------------------------------------------------*/
/*--- Preprocessor options                                 ---*/
/*------------------------------------------------------------*/


/*
 * JCC Generation
 */

/* Use JumpCall cost centers (JCC). Compile-time master switch. */
#define JCC 1


/* When JCC is enabled, the three sub-options below are switched on too.
 * NOTE(review): their consumers are not in this part of the file. */
#if JCC
#define JCC_INSTR 1
#define JCC_FIN   1
#define JCC_DUMP  1
#endif



/*
 * Debug options
 */

/* Enable assertions for cc/fcc operations (see is_zero_cc/add_cc) */
#define ENABLE_CC_ASSERTIONS 0

/* Set to 1 to compile in the CT_DEBUG/CT_DEBUGIF debug output macros and
 * the verbose form of CT_ASSERT. Output is still gated at run time by
 * clo_ct_verbose and clo_ct_verbose_start. */
#define JCC_DEBUG 1

/* Set to 1 if you want full sanity checks for JCC */
#define JCC_CHECK 0


/*
 * Debug helper macros.
 *
 * CT_DEBUGIF(x)          - statement prefix; the following statement runs
 *                          only if clo_ct_verbose > x and at least
 *                          clo_ct_verbose_start BBs were executed.
 * CT_DEBUG(x,format,...) - printf-style debug output under the same
 *                          condition. The first output for a BB is preceded
 *                          by a "BB# <n>" line (jcc_debug_bb_written).
 * CT_ASSERT(cond)        - like sk_assert(), but prints thread id, call
 *                          stack depth and current context first.
 *
 * With JCC_DEBUG == 0 the debug macros expand to nothing. CT_DEBUG must
 * still accept a format plus any number of arguments, as all call sites
 * pass them.
 *
 * Caution: CT_DEBUGIF/CT_DEBUG/CT_ASSERT expand to a bare "if" or a block,
 * and the non-debug CT_ASSERT already ends in ';'. Do not use them as the
 * unbraced body of an if that has an else branch.
 */
#if JCC_DEBUG
#define CT_DEBUGIF(x) \
  if ((clo_ct_verbose>x) && (bb_executions>=clo_ct_verbose_start))

#define CT_DEBUG(x,format,args...)               \
    CT_DEBUGIF(x) {                              \
      if (!jcc_debug_bb_written) {               \
        jcc_debug_bb_written = True;             \
        VG_(printf)("BB# %llu\n",bb_executions); \
      }                                          \
      VG_(printf)(format,##args);                \
    }

#define CT_ASSERT(cond)                          \
    if (!(cond)) {                               \
      CT_DEBUG(0,"In tid %d [%d] %s\n",          \
	       current_tid, call_stack_sp,       \
	       mangled_cxt(current_cxt,current_bbcc ? current_bbcc->rec_index : 0)); \
      sk_assert(cond);                           \
     }

#else
#define CT_DEBUGIF(x) if (0)
/* same arity as the debug version, so call sites compile unchanged */
#define CT_DEBUG(x,format,args...) {}
#define CT_ASSERT(cond) sk_assert(cond);
#endif



/*
  Output options
 */

/* Write a newline after each BB in the dump */
#define DUMP_BB 1

/* Write each instruction on its own line in the dump */
#define DUMP_INSTR 1



/*
 * Performance/space options
 */

/* JmpCall Stack, following the real ESP, resizable: initial entry count */
#define N_STACK_INITIAL_ENTRIES 500

/* Function context stack: initial entry count */
#define N_FNSTACK_INITIAL_ENTRIES 500

/* BB hash, resizable: initial size */
#define N_BB_INITIAL_ENTRIES  8437

/* BBCC hash 1, resizable: initial size */
#define N_BBCC1_INITIAL_ENTRIES  10437

/* JCC hash, resizable: initial size */
#define N_JCC_INITIAL_ENTRIES  4437

/* Context hash, resizable: initial size */
#define N_CXT_INITIAL_ENTRIES 2537

/* Function info table: initial size */
#define N_FNINFO_INITIAL_ENTRIES 3457

/* Fixed-size hashes (objects, files, functions, BBCC 2nd hash).
 * Each hash table is separately chained.  The sizes below work
 * fairly well for Konqueror. */

#define   N_OBJ_ENTRIES         47
#define  N_FILE_ENTRIES         53
#define    N_FN_ENTRIES         87
#define N_BBCC2_ENTRIES         37



/*------------------------------------------------------------*/
/*--- Command line args                                    ---*/
/*------------------------------------------------------------*/

/* Base name for dumps */
Char* clo_filename_base = 0;

/* Dump trace parts as separate files? */
Bool clo_separate_dumps;
/* Separate threads in dump? */
Bool clo_dump_threads;
/* Compress strings in profile dump? */
Bool clo_compress_strings;
/* Compress events in profile dump? */
Bool clo_compress_events;

/* Do cache simulation? */
Bool clo_simulate_cache;
/* Do HW prefetch simulation? */
Bool clo_simulate_hwpref;

/* Start in collecting state? */
Bool clo_collect_state;
/* Trace conditional jumps/arcs inside of a function? */
Bool clo_trace_jump;
/* Dump every xxx BBs. default: 0 = never */
Int clo_dumps;
/* Skip functions in PLT section? default: NO */
Bool clo_skip_plt = False;
/* Costs of a function dependent on how many callers? default: 0  */
Int clo_fn_caller = 0;
/* Max level of recursion for separate costs? default: 1 */
Int clo_fn_recursion = 1;
/* Let direct recursions increment rec.level? default: no */
Bool clo_skip_direct_recursion = False;
/* Mangle separation info into function names? default: no */
Bool clo_mangle_names = False;
/* Compress parts of mangled name? default: no */
Bool clo_compress_mangled = False;
/* Positions to dump: instruction address (def:no) / source line (def:yes) */
Bool clo_dump_line = True;
Bool clo_dump_instr = False;
Bool clo_dump_bb = False;
/* Use position differences? */
Bool clo_compress_pos = False;
/* Dump basic block information (instead of cost per source line)? def:no */
Bool clo_dump_bbs = False;
/* Dump info for skipped functions in a call? def:no */
Bool clo_dump_skipped = False;
#if JCC_DEBUG
/* Only if debugging is enabled: Verbosity level (compared in CT_DEBUGIF) */
Int clo_ct_verbose = 0;
/* Set by CT_DEBUG once it has printed the "BB# <n>" header line */
Bool jcc_debug_bb_written = False;
/* Debug output starts once bb_executions reaches this value */
ULong clo_ct_verbose_start = 0;
#endif

/*------------------------------------------------------------*/
/*--- Constants                                            ---*/
/*------------------------------------------------------------*/


/* According to IA-32 Intel Architecture Software Developer's Manual: Vol 2 */
#define MAX_x86_INSTR_SIZE              16

/* NOTE(review): presumably the smallest cache line size accepted by the
 * simulator - confirm at the point of use */
#define MIN_LINE_SIZE   16

/* Size of various buffers used for storing strings */
#define FILENAME_LEN                    256
#define FN_NAME_LEN                    4096 /* for C++ code :-) */
#define OBJ_NAME_LEN                    256
#define BUF_LEN                         512
#define COMMIFY_BUF_LEN                 128
#define RESULTS_BUF_LEN                 128
#define LINE_BUF_LEN                     64

/*------------------------------------------------------------*/
/*--- Profiling events                                     ---*/
/*------------------------------------------------------------*/

/* Skin-specific profiling event numbers; numbering continues directly
 * after VgpFini. */
typedef 
   enum { 
      VgpCacheGetBBCC = VgpFini+1,
      VgpCacheDump,
      VgpCacheSimulate,
      VgpCacheResults,
      VgpCacheCallDepth,
      VgpCacheCalltree,
      VgpCacheSetup
   }
   VgpSkinCC;


/*------------------------------------------------------------*/
/*--- Forward declarations of functions and structs        ---*/
/*------------------------------------------------------------*/

/* Short type names for the structs defined below */
typedef struct _Context     Context;
typedef struct _CC          CC;
typedef struct _BB          BB;
typedef struct _Skipped     Skipped;
typedef struct _BBCC        BBCC;
typedef struct _jCC         jCC;
typedef struct _fCC         fCC;
typedef struct _fn_node     fn_node;
typedef struct _file_node   file_node;
typedef struct _obj_node    obj_node;
typedef struct _fn_config   fn_config;
typedef struct _fn_info     fn_info;
typedef struct _call_entry  call_entry;
typedef struct _thread_cxtinfo thread_cxtinfo;
typedef struct _thread_info thread_info;
typedef struct _event_type  event_type;

/* Tri-state values.
 * NOTE(review): presumably used for the Int options in fn_config, with
 * CONFIG_DEFAULT meaning "not set for this function" - confirm at use. */
#define CONFIG_DEFAULT -1
#define CONFIG_FALSE   0
#define CONFIG_TRUE    1

/* Logging configuration for a function.
 * Entries are chained via <next>. */
struct _fn_config {
    Char* name;

    Int dump_before;
    Int dump_after;
    Int zero_before;
    Int toggle_collect;

    Int skip;    /* Handle CALL to this function as JMP (= Skip)? */
    Int group;   /* don't change caller dependency inside group !=0 */

    Int fn_caller;    /* separate logging dependent on caller  */
    Int fn_recursion; /* separate logging of rec. levels       */

#if JCC_DEBUG
    Int verbosity; /* Change debug verbosity level while in function */
#endif

    fn_config* next;
};



/* A simple cost metric with 2 dependencies:
 * Cost m1 always happens with a, and m2 always happens with m1.
 * E.g. a 2nd level cache miss can only happen with a 1st level miss
 *      and a data access
 *
 * These semantics are used in operations add_cc/add_diff_cc.
 */
struct _CC {
   ULong a;    /* accesses */
   ULong m1;   /* 1st level misses; only possible together with <a> */
   ULong m2;   /* 2nd level misses; only possible together with <m1> */
};


/* A full cost center - with instruction, data read and write costs.
 * No further assertions...
 * (with REP an instruction can produce data > instruction accesses)
 */
struct _fCC {
    CC Ir;   /* instruction costs */
    CC Dr;   /* data read costs */
    CC Dw;   /* data write costs */
};




/* JmpCall cost center
 * for subroutine call (from->bb->jmp_addr => to->bb->addr)
 *
 * Each BB has at most one CALL instruction. The list of JCC from
 * this call is a pointer to the list head (stored in BBCC), and
 * <next_from> in the JCC struct.
 *
 * For fast lookup, JCCs are reachable with a hash table, keyed by
 * the (from_bbcc,to) pair. <next_hash> is used for the JCC chain
 * of one hash table entry.
 *
 * Cost <sum> holds event counts for already returned executions.
 * <last> are the event counters at last enter of the subroutine.
 * <sum> is updated on returning from the subroutine by
 * adding the diff of <last> and current event counters to <sum>.
 *
 * After updating, <last> is set to current event counters. Thus,
 * events are not counted twice for recursive calls (TODO: True?)
 *
 * NOTE(review): the struct below has no <last> member; the counters at
 * call time appear to be kept in call_entry.fcc instead - confirm.
 */

/* Special jump kind values: "unset" and "conditional jump" */
#define JmpNone -1 
#define JmpCond -2

struct _jCC {
    Int  jmpkind;     /* JmpCall, JmpBoring, JmpCond */
    jCC* next_hash;   /* for hash entry chain */
    jCC* next_from;   /* next JCC from a BBCC */
    BBCC *from, *to;  /* call arc from/to this BBCC */

    ULong call_counter; /* no wraparound with 64 bit */
    fCC sum;            /* event counts of already returned executions */
};


/**
 * An instrumented basic block (BB).
 *
 * For each BB, a call (setup_bbcc) taking a pointer to the
 * corresponding BB structure is added in front of the BB code
 * at instrumentation time.
 * As the cost of a BB has to be distinguished depending on the context,
 * multiple cost centers for one BB (struct BBCC) exist and are
 * calculated by setup_bbcc.
 *
 * BBs are put into a resizable hash to allow for fast detection if a
 * BB is to be retranslated but cost info is already available.
 */
struct _BB {
    Addr       addr;       /* an address of 0 means: discarded */
    UInt       size;       /* length of BB in bytes */
    Addr       jmp_addr;   /* address of the 1st jmp instruction in this BB */
    obj_node*  obj;        /* ELF obj where this BB is in */
    fn_node*   fn;         /* debug info for this BB */
    Int        line;       /* line debug info */
    Bool       is_entry;   /* True if this BB is a function entry */
        
    VgSectKind sect_kind;  /* section of this BB, e.g. PLT */
    UInt       array_size; /* byte-size of variable length array in BBCCs */
    UInt       instr_count;

    BBCC*      bbcc_list;  /* BBCCs for same BB (see next_bbcc in BBCC) */
    BBCC*      last_bbcc;  /* Temporary: Cached for faster access (LRU) */
    BB*        next;       /* chaining for a hash entry */
};



/**
 * Execution context
 *
 * Basic blocks are always executed in a context.
 * An execution context is a list of function nodes.
 * Recursion levels are handled for active function and thread.
 *
 * To get a unique number for a full execution context, use
 *  rec_index = min(<fn->rec_separation>,<active>) - 1;
 *  unique_no = <number> + rec_index
 *
 * For each Context, recursion index and BB, there can be a BBCC.
 *
 * <fn> is a trailing variable-length array with <size> entries; fn[0] is
 * the entry used for names and recursion limits in the debug output.
 */
struct _Context {
    UInt size;        // number of function dependencies
    UInt base_number; // for context compression & dump array
    Context* next;    // entry chaining for hash
    UInt hash;        // for faster lookup...
    fn_node* fn[0];
};

/*
 * Skipped cost item
 *
 * Cost for skipped functions is collected in this struct,
 * allocated lazily when a call is done to a skipped function from
 * <fromBBCC>. We note the skipped context.
 * As many functions could be skipped in a call chain,
 * skipped cost items form a list (chained via <next>).
 */
struct _Skipped {
    fn_node* fn;
    BBCC* from;
    Skipped* next;
    fCC fcc;
};

/*
 * Basic Block Cost Center
 *
 * On demand, multiple BBCCs will be created for the same BB
 * dependent on command line options and:
 * - current function (it's possible that a BB is executed in the
 *   context of different functions, e.g. in manual assembler/PLT)
 * - current thread ID
 * - position where current function is called from
 * - recursion level of current function
 *
 * The cost centres for the instructions of a basic block are
 * stored in a contiguous array.
 * They are distinguishable by their tag field.
 */
struct _BBCC {
    BB*      bb;           /* BB for this cost center */

    Context* cxt;          /* execution context of this BBCC */
    ThreadId tid;          /* only for assertion check purpose */
    UInt     rec_index;    /* Recursion index in rec->bbcc for this bbcc */

    ULong    exe_counter;  /* execution counter for BB in this context */
    ULong    ret_counter;  /* how often returned from jccs of this bbcc */

    BBCC**   rec_array;    /* Array of pointers to recursion BBCCs */
    BBCC*    next_bbcc;    /* Chain of BBCCs for same BB */
    BBCC*    lru_next_bbcc; /* BBCC executed next the last time */
    
    jCC*     lru_from_jcc; /* Temporary: Cached for faster access (LRU) */
    jCC*     lru_to_jcc;   /* Temporary: Cached for faster access (LRU) */
    jCC*     jcc_list;     /* list of arcs called from jmp_addr */
    Skipped* skipped;      /* cost for skipped functions called from 
			    * jmp_addr. Allocated lazily */

    BBCC*    next1;         /* entry chain in 1st hash */
    BBCC*    next2;         /* entry chain in 2nd hash */
    Addr     array;         /* pointer to variable length array. If
			     * clo_simulate_cache is false, this points
			     * to same array for all BBCC of one BB
			     * (we only need instr. addresses for dump) */
};


/* The <number> of fn_node, file_node and obj_node is used for compressed
 * dumping and as an index into the dump boolean table and fn_info_table.
 */

/* A function. Nodes are chained via <next>; the per-function settings
 * mirror the members of fn_config. */
struct _fn_node {
    Char*      name;
    UInt       number;
    Context*   last_cxt; /* LRU info */
    Context*   pure_cxt; /* the context with only the function itself */
    file_node* file;     /* reverse mapping for 2nd hash */
    fn_node* next;

    Bool dump_before;
    Bool dump_after;
    Bool zero_before;
    Bool toggle_collect;
    Bool skip;
    Int  group;
    Int  fn_caller;
    Int  fn_recursion;
#if JCC_DEBUG
    Int  verbosity; /* Stores old verbosity level while in function */
#endif
};

/* A source file: fixed-size hash of its functions, back pointer to the
 * containing object, chained via <next>. */
struct _file_node {
   Char*      name;
   fn_node*   fns[N_FN_ENTRIES];
   UInt       number;
   obj_node*  obj;
   file_node* next;
};

/* Objects can already be discarded at the time of the dump.
 * FIXME: Hope the <name> is unique, and a shared object is
 *        not dlopened multiple times at different addresses
 */
struct _obj_node {
   Char*      name;
   Addr       start;
   UInt       size;
   UInt       offset;
   file_node* files[N_FILE_ENTRIES];  /* fixed-size hash of source files */
   UInt       number;
   obj_node*  next;
};

/* Function and thread specific info.
 * Used in fn_info_table, using fn_node->number as index.
 * Instances are created zeroed by new_fn_info().
 */
struct _fn_info {
    ULong call_counter;           /* how many times called at all */
    BBCC* bbccs[N_BBCC2_ENTRIES]; /* hash for fast deletion */
};


/* An entry in the call stack
 *
 * <nonskipped> is 0 if the function called is not skipped (usual case).
 * Otherwise, it is the last non-skipped BBCC. This one gets all
 * the calls to non-skipped functions and all costs in skipped 
 * instructions.
 */
struct _call_entry {
    jCC* jcc;           /* jCC for this call */
    fCC  fcc;           /* cost event counters at entering frame */
    Addr esp;           /* ESP at call time */
    BBCC* nonskipped;   /* see above */
    Context* cxt;       /* context before call */
    Int fn_sp;          /* function stack index before call */
};


/*
 * Thread context state while inside of a signal handler or in another thread.
 * As there's no scheduling among running signal handlers of one thread,
 * we only need a subset of a full thread state:
 * - event counter
 * - collect state
 * - last BB, last jump kind, last nonskipped BB
 * - callstack pointer for sanity checking and correct unwinding
 *   after exit
 */
struct _thread_cxtinfo {

    /* the signum of the handler, 0 for main thread context
     */
    Int sigNum;

    /* the call stack pointer at entering the signal handler */
    Int call_stack_bottom;

    fCC current;            /* event counter */
    Bool collect_state;
    Context* current_cxt;

    /* last BB executed */
    Int  bbcc_jmpkind;
    BBCC* current_bbcc;
    BBCC* current_nonskipped;
};

/* The maximum number of simultaneously running signal handlers per thread.
 * This is used for the fixed size of the stack of thread_cxtinfo pointers
 * (cxt_stack) in the thread state. */
#define MAX_SIGHANDLERS 8

/* Thread State 
 *
 * This structure stores thread specific info while a thread is *not*
 * running. See function switch_thread() for save/restore on thread switch.
 *
 * If --dump-threads=no, BBCCs and JCCs can be shared by all threads, i.e.
 * only structures of thread 1 are used.
 * This involves variables fn_info_table, bbcc_table and jcc_table.
 *
 * Most members have a same-named static global holding the value for the
 * currently running thread.
 */
struct _thread_info {

    /* resizable call stack */
    call_entry* stack;
    Int   stack_size;
    Int   stack_sp;

    /* event counters */
    fCC last;            /* counters at last dump */
    fCC discards;        /* events for discarded code */

    /* execution context */
    fn_node** fn_stack;
    fn_node** fn_stack_top;
    Int fn_stack_size;

    /* execution contexts (main thread + signal handlers) */
    thread_cxtinfo* cxt_stack[MAX_SIGHANDLERS];
    Int cxt_stackpointer; /* > 0 if a handler is running */
    fCC handler_sum;
    
    /* function active counts (used for fn-naming with recursion) */
    UInt* fn_active_array;
    Int fn_active_array_size;

    /* function info array: call counts & bbcc hash */
    fn_info** fn_info_table;
    Int       fn_info_table_size;

    /* thread dependent hashes: jcc, bbcc */
    Int jcc_table_size;
    Int jcc_table_entries;
    jCC **jcc_table;
    jCC *jcc_spontaneous;

    Int bbcc_table_size;
    Int bbcc_table_entries;
    BBCC ** bbcc_table;
};


/*------------------------------------------------------------*/
/*--- Static global variables                              ---*/
/*------------------------------------------------------------*/


/*
 * Statistics
 *
 * 64-bit event counters first, then Int counters for distinct items,
 * debug info availability (updated in get_debug_info), cache misses of
 * the LRU shortcuts, and table resizes.
 */
static ULong call_counter        = 0;
static ULong jcnd_counter        = 0;
static ULong jump_counter        = 0;
static ULong rec_call_counter    = 0;
static ULong ret_counter         = 0;
static ULong bb_executions       = 0;  /* also gates debug output, see CT_DEBUGIF */

static Int  context_counter     = 0;
static Int  distinct_objs       = 0;
static Int  distinct_files      = 0;
static Int  distinct_fns        = 0;   /* highest function index, see resize_fn_active_array */
static Int  distinct_contexts   = 0;
static Int  distinct_bbs        = 0;
static Int  distinct_bbccs      = 0;
static Int  distinct_instrs     = 0;
static Int  distinct_skips      = 0;
static Int  full_debug_BBs      = 0;   /* file/line and function name found */
static Int  file_line_debug_BBs = 0;   /* only file/line found */
static Int  fn_name_debug_BBs   = 0;   /* only function name found */
static Int  no_debug_BBs        = 0;   /* neither found */
static Int  BB_retranslations   = 0;
static Int  bb_table_resizes    = 0;
static Int  bbcc_lru_misses     = 0;
static Int  bbcc_table_resizes  = 0;
static Int  jcc_lru_misses      = 0;
static Int  jcc_table_resizes   = 0;
static Int  cxt_lru_misses      = 0;
static Int  bbcc_clones         = 0;
static Int  fn_info_table_resizes = 0;
static Int  fn_active_array_resizes = 0;


/*
 * Cost event types
 *
 * Which events should be tracked?
 * costtype_register() sets up the following vars.
 * The FCCs in the JCC/BBCC structure are changed on demand if a
 * subfunction adds a new cost type.
 */
#define N_MAX_COSTTYPE 20
/* the three arrays are parallel, indexed by cost type number */
static Bool   costtype_first_of_cc[N_MAX_COSTTYPE];
static UChar* costtype_name[N_MAX_COSTTYPE];   /* strdup'ed names */
static Int    costtype_count = 0;              /* number of registered types */

/*
 * I/O related
 */

/* Default file names */
#define DEFAULT_DUMPNAME    "cachegrind.out"
#define DEFAULT_COMMANDNAME "cachegrind.cmd"
#define DEFAULT_RESULTNAME  "cachegrind.res"
#define DEFAULT_INFONAME    "/tmp/cachegrind.info"

/* Dump Part Counter */
static Int out_counter = 0;

/* File names/paths; all unset (0) here.
 * NOTE(review): presumably filled in during initialization - the code
 * doing so is outside this part of the file. */
static Char* info_file = 0;
static Char* command_file = 0;
static Char* command_file2 = 0;
static Char* result_file = 0;
static Char* dump_file_base = 0;
static Char* base_directory = 0;

/* Total reads/writes/misses sum over all dumps and threads.
 * Updated during CC traversal at dump time.
 */
static fCC total_fcc;
static fCC dump_total_fcc;

/*
 * Thread specific global vars, switched on thread switch
 * (see the matching members of thread_info / thread_cxtinfo)
 */

/* current running thread */
static ThreadId current_tid;

/* event counters */
static fCC* current_fcc = 0;
static fCC* discards_fcc = 0;
static Bool collect_state = True;

/* Execution context */
static Int current_sigNum = 0;  /* >0 if signal handler is running */
static fn_node** fn_stack = 0;
static fn_node** fn_stack_top = 0;
static Int fn_stack_size = 0;
static Context* current_cxt = 0;


/* At the end of each BB, we store the JmpKind.
 * Special values: -1 is "unset" (JmpNone), -2 is "conditional jump"
 * (JmpCond).
 */
static Int  bbcc_jmpkind;
/* in setup_bbcc, we calculate the BBCC that gets the costs */
static BBCC* current_bbcc;

/* Function active counter array, indexed by function number.
 * Grown by resize_fn_active_array().
 */
static UInt* fn_active_array;
static Int fn_active_array_size;

/* function info table, indexed by function number;
 * entries are created on demand by get_fn_info() */
static fn_info** fn_info_table;
static Int       fn_info_table_size;
static Int       fn_info_table_entries = 0;


/*
 * Hash tables
 *
 * For each resizable table: current size, number of entries, and the
 * bucket array.
 */


/* JCC hash table */
static Int jcc_table_size    = 0;
static Int jcc_table_entries = 0;
static jCC **jcc_table = 0;
static jCC *jcc_spontaneous = 0;


/* BB hash table */
static Int   bb_table_size    = 0;
static Int   bb_table_entries = 0;
static BB  **bb_table = 0;


/* Context hash table */
static Int   cxt_table_size    = 0;
static Int   cxt_table_resizes = 0;
static Int   cxt_table_entries = 0;
static Context **cxt_table = 0;


/* BBCC table (key is Context/BB), resizable */
static Int bbcc_table_size = 0;
static Int bbcc_table_entries = 0;
static BBCC **bbcc_table = 0;


/* Object hash table, fixed */
static obj_node* obj_table[N_OBJ_ENTRIES];


/* Stack of current thread. Gets initialized when switching to 1st thread.
 *
 * The artificial call stack is an array of call_entry's, representing
 * stack frames of the executing program. It grows on demand.
 * The ESP of each stack frame is stored in the <esp> member of its
 * call_entry.
 */
static call_entry* call_stack;
static Int   call_stack_size;
/* number of current stack frame; entries [0, call_stack_sp) are in use
 * (see print_call_stack) */
static Int   call_stack_sp;

/* this is != 0 if in skipped function. See call_entry->nonskipped */
static BBCC* current_nonskipped = 0;




/*------------------------------------------------------------*/
/*--- Function active counter array and info table         ---*/
/*------------------------------------------------------------*/


/* Grow fn_active_array until <distinct_fns> is a valid index.
 *
 * <distinct_fns> is the highest index in use, so the size is doubled
 * until it is strictly bigger. Existing counters are carried over, new
 * slots start at 0, and the old array is freed.
 */
static void resize_fn_active_array()
{
    UInt* grown;
    Int idx, wanted;

    for (wanted = fn_active_array_size; wanted <= distinct_fns; wanted *= 2)
	;

    CT_DEBUG(0, "Resize fn_active_array: %d => %d\n", fn_active_array_size, wanted);

    grown = VG_(malloc)(wanted * sizeof(UInt));

    /* keep the old counters ... */
    for (idx = 0; idx < fn_active_array_size; idx++)
	grown[idx] = fn_active_array[idx];
    /* ... and clear the newly added tail */
    for (; idx < wanted; idx++)
	grown[idx] = 0;

    VG_(free)(fn_active_array);
    fn_active_array_size = wanted;
    fn_active_array = grown;
    fn_active_array_resizes++;
}



static __inline
fn_info* new_fn_info()
{
    Int i;
    fn_info* new = VG_(malloc)(sizeof(fn_info));
    new->call_counter = 0;
    for (i = 0; i < N_BBCC2_ENTRIES; i++) {
	new->bbccs[i] = 0;
    }
    
    fn_info_table_entries++;

    return new;
}


/* Grow fn_info_table until <distinct_fns> is a valid index.
 *
 * The size is doubled until it exceeds <distinct_fns>. Existing entries
 * are carried over, new slots are set to 0 (no info allocated yet), and
 * the old table is freed.
 */
static void resize_fn_info_table()
{
    fn_info** grown;
    Int idx, wanted;

    for (wanted = fn_info_table_size; wanted <= distinct_fns; wanted *= 2)
	;

    CT_DEBUG(0, "Resize fn_info_table: %d => %d\n", fn_info_table_size, wanted);

    grown = VG_(malloc)(wanted * sizeof(fn_info*));

    /* keep the old entries ... */
    for (idx = 0; idx < fn_info_table_size; idx++)
	grown[idx] = fn_info_table[idx];
    /* ... and mark the rest as not yet allocated */
    for (; idx < wanted; idx++)
	grown[idx] = 0;

    VG_(free)(fn_info_table);
    fn_info_table_size = wanted;
    fn_info_table = grown;
    fn_info_table_resizes++;
}

/* Return the fn_info for <fn>, creating it on first use.
 * The table is resized first if fn->number is not yet a valid index. */
static __inline__
fn_info* get_fn_info(fn_node* fn)
{
    fn_info** slot;

    if (fn_info_table_size <= fn->number)
	resize_fn_info_table();

    /* take the slot address only after a possible resize */
    slot = &fn_info_table[fn->number];
    if (*slot == 0)
	*slot = new_fn_info();

    return *slot;
}

    
/*------------------------------------------------------------*/
/*--- Debug output                                         ---*/
/*------------------------------------------------------------*/


/* Print <s> spaces of indentation.
 * <s> is clamped to the range [0, 40]: larger values print 40 spaces,
 * negative values print nothing. */
static void print_indent(int s)
{
    /* max of 40 spaces */
    static const char sp[] = "                                        ";
    const int max = (int) sizeof(sp) - 1;

    if (s > max) s = max;
    /* a negative width would make the pointer below run past the buffer */
    if (s < 0)   s = 0;

    /* pass the padding as data, never as the format string */
    VG_(printf)("%s", sp + max - s);
}

/* Print address and object name of <bb> (no trailing newline).
 * A negative <s> requests indentation by -s spaces first. */
static void print_bb(int s, BB* bb)
{
    if (s<0)
	print_indent(-s);

    VG_(printf)("BB 0x%08x (Obj '%s')", bb->addr, bb->obj->name);
}


static Char* mangled_cxt(Context* cxt, int rec_index)
{
    static Char mangled[FN_NAME_LEN];
    int i, p;

    if (!cxt) return "(no context)";

    p = VG_(sprintf)(mangled, "%s", cxt->fn[0]->name);
    if (rec_index >0)
	p += VG_(sprintf)(mangled+p, "'%d", rec_index +1);
    for(i=1;i<cxt->size;i++)
	p += VG_(sprintf)(mangled+p, "'%s", cxt->fn[i]->name);

    return mangled;
}





/* Print one line describing context <cxt> at recursion index <rec_index>:
 * context number, recursion/caller limits, active count of the first
 * function and the mangled name. Prints "(no context)" if <cxt> is 0.
 * A negative <s> requests indentation by -s spaces first. */
static void print_cxt(int s, Context* cxt, int rec_index)
{
    fn_node* top;

    if (s<0)
	print_indent(-s);

    if (!cxt) {
	VG_(printf)("(no context)\n");
	return;
    }

    CT_ASSERT(cxt->fn[0]->number < fn_active_array_size);
    CT_ASSERT(rec_index < cxt->fn[0]->fn_recursion);

    top = cxt->fn[0];
    VG_(printf)("Cxt %d (rmax %d, cmax %d) - act %d: %s",
		cxt->base_number + rec_index,
		top->fn_recursion, cxt->size-1,
		fn_active_array[top->number],
		mangled_cxt(cxt, rec_index));
    VG_(printf)("\n");
}


/* Print <bbcc> (pointer, BB address, execution and return counters),
 * followed by its context on a second, further indented line.
 * Prints "BBCC 0x0" if <bbcc> is 0.
 * A negative <s> requests indentation by -s spaces first. */
static void print_bbcc(int s, BBCC* bbcc)
{
    int width = s;

    if (s<0) {
	width = -s;
	print_indent(width);
    }

    if (bbcc) {
	VG_(printf)("BBCC 0x%p for BB 0x%x, exe %llu, ret %llu\n",
		    bbcc, bbcc->bb->addr,
		    bbcc->exe_counter, bbcc->ret_counter);

	/* context goes 8 columns further to the right */
	print_cxt(-(width+8), bbcc->cxt, bbcc->rec_index);
    }
    else
	VG_(printf)("BBCC 0x0\n");
}

/* Print the three cost centers (Ir, Dr, Dw) of <fcc>, one per line.
 * The Dr and Dw lines are indented by |s|+3.
 * A negative <s> requests indentation of the first line by -s spaces. */
static void print_fcc(int s, fCC* fcc)
{
    CC* ir = &fcc->Ir;
    CC* dr = &fcc->Dr;
    CC* dw = &fcc->Dw;
    int width = s;

    if (s<0) {
	width = -s;
	print_indent(width);
    }

    VG_(printf)("CC Ir A %llu, L1Miss %llu, L2Miss %llu\n",
		ir->a, ir->m1, ir->m2);

    print_indent(width+3);
    VG_(printf)("Dr A %llu, L1Miss %llu, L2Miss %llu\n",
		dr->a, dr->m1, dr->m2);

    print_indent(width+3);
    VG_(printf)("Dw A %llu, L1Miss %llu, L2Miss %llu\n",
		dw->a, dw->m1, dw->m2);
}

/* Print a one-line summary of <jcc> without trailing newline:
 * source jump address, target BB address, call count and the access
 * counts of the summed costs. A 0 <jcc> is printed as "[Skipped JCC]". */
static void print_short_jcc(jCC* jcc)
{
    if (!jcc) {
	VG_(printf)("[Skipped JCC]");
	return;
    }

    VG_(printf)("%x => %x [%llu/%llu,%llu,%llu]",
		jcc->from->bb->jmp_addr,
		jcc->to->bb->addr,
		jcc->call_counter,
		jcc->sum.Ir.a,
		jcc->sum.Dr.a,
		jcc->sum.Dw.a);
}

/* Print <jcc> in full: source and target BBCC, call count and summed
 * costs, on several lines. A 0 <jcc> is printed as
 * "JCC to skipped function" (no newline).
 * A negative <s> requests indentation of the first line by -s spaces. */
static void print_jcc(int s, jCC* jcc)
{
    int width = (s<0) ? -s : s;

    if (s<0)
	print_indent(width);

    if (jcc) {
	VG_(printf)("JCC from ");
	print_bbcc(width+9, jcc->from);

	print_indent(width+4);    
	VG_(printf)("to   ");
	print_bbcc(width+9, jcc->to);

	print_indent(width+4);    
	VG_(printf)("with call count %llu\n", jcc->call_counter);

	print_indent(width+4);    
	VG_(printf)("cost ");
	print_fcc(width+9, &(jcc->sum));
    }
    else
	VG_(printf)("JCC to skipped function");
}

/* dump out the current call stack */
static void print_stackentry(int s, int sp)
{
    if (s<0) {
	s = -s;
	print_indent(s);
    }
    VG_(printf)("[%-2d] ESP 0x%x", sp, call_stack[sp].esp);
    if (call_stack[sp].nonskipped)
	VG_(printf)(" NonSkipped BB 0x%x / %s",
		    call_stack[sp].nonskipped->bb->addr,
		    call_stack[sp].nonskipped->cxt->fn[0]->name);
    VG_(printf)("\n");
    print_indent(s+5);
    print_jcc(5,call_stack[sp].jcc);
}

/* Debug output: print all call stack entries below call_stack_sp,
 * starting with entry 0. */
static void print_call_stack()
{
    int frame = 0;

    VG_(printf)("Call Stack:\n");
    while (frame < call_stack_sp) {
	print_stackentry(-2, frame);
	frame++;
    }
}



/*------------------------------------------------------------*/
/*--- Generic utility stuff                                ---*/
/*------------------------------------------------------------*/


 
/* Look up debug info for <instr_addr>.
 *
 * filename, fn_name: output buffers of FILENAME_LEN / FN_NAME_LEN bytes;
 *                    set to "???" when the respective info is missing.
 * line_num:          if non-0, receives the line number, or 0 when no
 *                    file/line info was found.
 * pSegInfo:          if non-0, receives the result of VG_(get_obj)();
 *                    this is 0 for generated code in anonymous space.
 *
 * Returns False only if neither file/line nor function name was found.
 * Also updates the *_debug_BBs statistics counters.
 */
static Bool get_debug_info(Addr instr_addr, 
			   Char filename[FILENAME_LEN],
			   Char fn_name[FN_NAME_LEN], Int* line_num,
			   SegInfo** pSegInfo)
{
    Bool have_file, have_fn;
    int found_line;

    CT_DEBUG(4, "  + get_debug_info(0x%08x)\n", instr_addr);

    if (pSegInfo)
	*pSegInfo = VG_(get_obj)(instr_addr);

    have_file = VG_(get_filename_linenum)(instr_addr, filename,
					  FILENAME_LEN, &found_line);
    have_fn   = VG_(get_fnname)(instr_addr,
				fn_name, FN_NAME_LEN);

    if (have_file) {
	/* file/line known; function name may still be missing */
	if (have_fn)
	    full_debug_BBs++;
	else {
	    file_line_debug_BBs++;
	    VG_(strcpy)(fn_name,  "???");
	}
	if (line_num) *line_num = found_line;
	return True;
    }

    /* no file/line info from here on */
    VG_(strcpy)(filename, "???");
    if (line_num) *line_num = 0;

    if (have_fn) {
	fn_name_debug_BBs++;
	return True;
    }

    no_debug_BBs++;
    VG_(strcpy)(fn_name,  "???");
    return False;
}


/* Print a compact one-line description of <bbcc> (no newline):
 * BB address, a section letter (T/D/B/G/P, U for anything else), context
 * number, mangled context name, object base name and - if a file name is
 * known - the file, plus the line when the BB belongs to the context's
 * first function. A 0 <bbcc> is printed as a zero address. */
static void print_bbcc_fn(BBCC* bbcc)
{
    const UChar* obj_name;
    fn_node* top;
    int pos, base = 0;
    char sect;

    if (!bbcc) {
	VG_(printf)("%08x", 0);
	return;
    }

    top = bbcc->cxt->fn[0];

    /* find start of the object's base name (after the last '/') */
    obj_name = top->file->obj->name;
    for (pos = 0; obj_name[pos]; pos++)
	if (obj_name[pos]=='/') base = pos+1;

    switch (bbcc->bb->sect_kind) {
    case Vg_SectText: sect = 'T'; break;
    case Vg_SectData: sect = 'D'; break;
    case Vg_SectBSS:  sect = 'B'; break;
    case Vg_SectGOT:  sect = 'G'; break;
    case Vg_SectPLT:  sect = 'P'; break;
    default:          sect = 'U'; break;
    }

    VG_(printf)("%08x/%c  %d:%s", bbcc->bb->addr, 
		sect,
		bbcc->cxt->base_number+bbcc->rec_index,
		mangled_cxt(bbcc->cxt, bbcc->rec_index));

    if (obj_name[0])
	VG_(printf)(" %s", obj_name+base);

    if (VG_(strcmp)(top->file->name, "???") !=0) {
	VG_(printf)(" %s", top->file->name);
	if ((top == bbcc->bb->fn) && (bbcc->bb->line>0))
	    VG_(printf)(":%d", bbcc->bb->line);
    }
}

/* Dump out an address with source info if available (no newline):
 * the address, the function name unless unknown, the base name of the
 * containing object unless empty, and "(file:line)" if a line is known.
 * Address 0 is printed as a zero-padded 0 only.
 *
 * For code without a SegInfo (get_debug_info() yields 0, e.g. generated
 * code in anonymous space) the object name is treated as empty instead
 * of being looked up through a null SegInfo.
 */
static void print_addr(Addr addr)
{
    Char fl_buf[FILENAME_LEN];
    Char fn_buf[FN_NAME_LEN];
    const UChar* obj_name;
    SegInfo* si;
    int ln, i=0, opos=0;
	
    if (addr == 0) {
	VG_(printf)("%08x", addr);
	return;
    }

    get_debug_info(addr, fl_buf, fn_buf, &ln, &si);

    /* no SegInfo or no file name: fall back to an empty object name */
    obj_name = si ? VG_(seg_filename)(si) : 0;
    if (!obj_name)
	obj_name = (const UChar*) "";

    /* find start of the object's base name (after the last '/') */
    while(obj_name[i]) {
	if (obj_name[i]=='/') opos = i+1;
	i++;
    }
    
    if (VG_(strcmp)(fn_buf,"???")==0)
	VG_(printf)("%x", addr);
    else
	VG_(printf)("%x %s", addr, fn_buf);

    if (obj_name[0])
	VG_(printf)(" %s", obj_name+opos);

    if (ln>0)
    	VG_(printf)(" (%s:%u)", fl_buf,ln);
}

/* Same output as print_addr(), terminated by a newline. */
static void print_addr_ln(Addr addr)
{
    print_addr(addr);
    VG_(printf)("\n");
}



/*------------------------------------------------------------*/
/*--- Simple Cost center types, operations                 ---*/
/*------------------------------------------------------------*/


/* Register a new cost type.
 *
 * A private copy of <name> is stored together with the <first_of_cc>
 * flag. Returns the index of the new cost type. Panics when all
 * N_MAX_COSTTYPE slots are already in use.
 */
static Int costtype_register(UChar* name, Bool first_of_cc)
{
    Int idx;

    /* ">=" instead of "==": never write behind the arrays, even if the
     * counter should ever have passed the limit */
    if (costtype_count >= N_MAX_COSTTYPE) {
	VG_(printf)("\nMore than %d cost types used!\n"
		    "Increase N_MAX_COSTTYPE in ct_main.c and recompile skin!\n",
		    N_MAX_COSTTYPE);
	VG_(skin_panic)("Too many cost types requested.");
    }

    idx = costtype_count;
    costtype_name[idx] = (UChar*) VG_(strdup)(name);
    costtype_first_of_cc[idx] = first_of_cc;
    costtype_count++;

    return idx;
}

/* Return the index of the cost type called <name>. The registered
 * types are searched from the newest to the oldest one; an unknown
 * name is registered on the fly (with first_of_cc being False).
 */
static Int get_costtype(UChar* name)
{
    Int idx = costtype_count;

    while (idx-- > 0) {
        if (VG_(strcmp)(name,costtype_name[idx]) == 0)
            return idx;
    }

    return costtype_register(name, False);
}



/* Reset all three counters (a, m1, m2) of a cost center to zero */
static __inline__ void init_cc(CC* cc) {
    cc->a = cc->m1 = cc->m2 = 0;
}
 
/* A cost center counts as zero when its access counter <a> is zero.
 * With ENABLE_CC_ASSERTIONS, the miss counters are checked to be
 * consistent with that (no access -> no miss, no m1 -> no m2).
 */
static __inline__
Bool is_zero_cc(CC* cc)
{
    if (cc->a != 0) {
#if ENABLE_CC_ASSERTIONS
        if (cc->m1 == 0) {
            CT_ASSERT(cc->m2 == 0);
        }
#endif
        return False;
    }

#if ENABLE_CC_ASSERTIONS
    CT_ASSERT(cc->m1 == 0);
    CT_ASSERT(cc->m2 == 0);
#endif
    return True;
}

/* Compare two cost centers. The counters are compared in the order
 * a, m1, m2; as soon as an equal pair is zero, the remaining counters
 * are not looked at and the cost centers are taken as equal.
 */
static __inline__
Bool is_equal_cc(CC* cc1, CC* cc2)
{
    Bool same = False;

    if (cc1->a == cc2->a) {
        if (cc1->a == 0)
            same = True;
        else if (cc1->m1 == cc2->m1) {
            if ((cc1->m1 == 0) || (cc1->m2 == cc2->m2))
                same = True;
        }
    }
    return same;
}

/* Copy the three counters (a, m1, m2) of cost center src to dst */
static __inline__ 
void copy_cc(CC* dst, CC* src)
{
    dst->a  = src->a;
    dst->m1 = src->m1;
    dst->m2 = src->m2;
}


/* Add cost center src to dst; src is left untouched.
 * Counters are visited in the order a, m1, m2, and the first zero
 * counter of src ends the addition (no access -> no L1/L2 miss,
 * no L1 miss -> no L2 miss).
 */
static __inline__ 
void add_cc(CC* dst, CC* src)
{
    if (src->a) {
        dst->a += src->a;

        if (src->m1) {
            dst->m1 += src->m1;

            if (src->m2)
                dst->m2 += src->m2;
        }
#if ENABLE_CC_ASSERTIONS
        else {
            CT_ASSERT(src->m2 == 0);
        }
#endif
    }
#if ENABLE_CC_ASSERTIONS
    else {
        CT_ASSERT(src->m1 == 0);
        CT_ASSERT(src->m2 == 0);
    }
#endif
}


/* Subtract cost center src from dst; src is left untouched.
 * Counters are visited in the order a, m1, m2, and the first zero
 * counter of src ends the subtraction. Before each subtraction it is
 * asserted that the dst counter is not smaller than the src counter.
 */
static
void sub_cc(CC* dst, CC* src)
{
    if (src->a) {
        CT_ASSERT( dst->a >= src->a);
        dst->a -= src->a;

        if (src->m1) {
            CT_ASSERT( dst->m1 >= src->m1);
            dst->m1 -= src->m1;

            if (src->m2) {
                CT_ASSERT( dst->m2 >= src->m2);
                dst->m2 -= src->m2;
            }
        }
#if ENABLE_CC_ASSERTIONS
        else {
            CT_ASSERT(src->m2 == 0);
        }
#endif
    }
#if ENABLE_CC_ASSERTIONS
    else {
        CT_ASSERT(src->m1 == 0);
        CT_ASSERT(src->m2 == 0);
    }
#endif
}


/* Move the cost of src into dst: every counter of src which gets added
 * to dst is reset to zero afterwards.
 * Returns False if there was nothing to add (src->a == 0), True else.
 */
static __inline__ 
Bool add_and_zero_cc(CC* dst, CC* src)
{
    /* no cache access -> no L1/L2 miss */
    if (src->a == 0) {
#if ENABLE_CC_ASSERTIONS
        CT_ASSERT(src->m1 == 0);
        CT_ASSERT(src->m2 == 0);
#endif
        return False;
    }

    dst->a += src->a;
    src->a = 0;

    if (src->m1 != 0) {
        dst->m1 += src->m1;
        src->m1 = 0;

        if (src->m2 != 0) {
            dst->m2 += src->m2;
            src->m2 = 0;
        }
    }
#if ENABLE_CC_ASSERTIONS
    else {
        /* no L1 miss -> no L2 miss */
        CT_ASSERT(src->m2 == 0);
    }
#endif

    return True;
}

/* Add the difference (new - old) of two cost centers to dst, and bring
 * the counters of old which took part up to the values of new.
 * Counters are visited in the order a, m1, m2; the first counter
 * without any difference ends the operation.
 * Returns False if the access counters do not differ (nothing added).
 */
static __inline__ 
Bool add_diff_cc(CC* dst, CC* old, CC* new)
{
    Long delta = new->a - old->a;

    /* no cache access -> no L1/L2 miss */
    if (delta == 0) {
#if ENABLE_CC_ASSERTIONS
        CT_ASSERT(new->m1 - old->m1 == 0);
        CT_ASSERT(new->m2 - old->m2 == 0);
#endif
        return False;
    }
    dst->a += delta;
    old->a = new->a;

    delta = new->m1 - old->m1;
    if (delta != 0) {
        dst->m1 += delta;
        old->m1 = new->m1;

        delta = new->m2 - old->m2;
        if (delta != 0) {
            dst->m2 += delta;
            old->m2 = new->m2;
        }
    }
#if ENABLE_CC_ASSERTIONS
    else {
        /* no L1 miss -> no L2 miss */
        CT_ASSERT(new->m2 - old->m2 == 0);
    }
#endif

    return True;
}




/* Reset the Ir, Dr and Dw cost centers of a full cost center */
static
void init_fcc(fCC* fcc)
{
    init_cc(&fcc->Ir);
    init_cc(&fcc->Dr);
    init_cc(&fcc->Dw);
}


/* True for a null pointer, or if Ir, Dr and Dw all are zero cost
 * centers (checked in this order, stopping at the first non-zero one).
 */
static
Bool is_zero_fcc(fCC* fcc)
{
    if (fcc == 0) return True;

    if (!is_zero_cc(&fcc->Ir)) return False;
    if (!is_zero_cc(&fcc->Dr)) return False;
    if (!is_zero_cc(&fcc->Dw)) return False;

    return True;
}

/* True if the Ir, Dr and Dw parts of both full cost centers are equal
 * in the sense of is_equal_cc().
 */
static
Bool is_equal_fcc(fCC* fcc1, fCC* fcc2)
{
    if (!is_equal_cc(&fcc1->Ir, &fcc2->Ir)) return False;
    if (!is_equal_cc(&fcc1->Dr, &fcc2->Dr)) return False;
    if (!is_equal_cc(&fcc1->Dw, &fcc2->Dw)) return False;

    return True;
}    

/* Copy the Ir, Dr and Dw cost centers from src to dst */
static
void copy_fcc(fCC* dst, fCC* src)
{
    copy_cc(&dst->Ir, &src->Ir);
    copy_cc(&dst->Dr, &src->Dr);
    copy_cc(&dst->Dw, &src->Dw);
}

/* Add the Ir, Dr and Dw costs of src to dst.
 *
 * This is done with add_and_zero_cc(), so the counters of src which
 * were added are reset to zero as a side effect.
 * NOTE(review): sub_fcc() leaves its src untouched; confirm that the
 * callers of add_fcc() really expect src to be cleared here.
 */
static
void add_fcc(fCC* dst, fCC* src)
{
    add_and_zero_cc(&dst->Ir, &src->Ir);
    add_and_zero_cc(&dst->Dr, &src->Dr);
    add_and_zero_cc(&dst->Dw, &src->Dw);
}

/* Subtract the Ir, Dr and Dw costs of src from dst (see sub_cc) */
static
void sub_fcc(fCC* dst, fCC* src)
{
    sub_cc(&dst->Ir, &src->Ir);
    sub_cc(&dst->Dr, &src->Dr);
    sub_cc(&dst->Dw, &src->Dw);
}

/* Apply add_diff_cc() to the Ir, Dr and Dw parts. All three parts are
 * always processed; the result is True if at least one of them
 * reported an addition, False otherwise.
 */
static
Bool add_diff_fcc(fCC* dst, fCC* old, fCC* new)
{
    Bool ir_changed = add_diff_cc(&dst->Ir, &old->Ir, &new->Ir);
    Bool dr_changed = add_diff_cc(&dst->Dr, &old->Dr, &new->Dr);
    Bool dw_changed = add_diff_cc(&dst->Dw, &old->Dw, &new->Dw);

    if (ir_changed || dr_changed || dw_changed)
        return True;

    return False;
}


#if 0

/* simple one */
static
int sprint_fcc(Char* buf, fCC* fcc)
{
    return VG_(sprintf)(buf, "%llu %llu %llu %llu %llu %llu %llu %llu %llu",
			fcc->Ir.a,  fcc->Dr.a,  fcc->Dw.a,
			fcc->Ir.m1, fcc->Dr.m1,	fcc->Dw.m1,
			fcc->Ir.m2, fcc->Dr.m2,	fcc->Dw.m2);
}

#else

/*
 * Space saving variant: write the counters of a full cost center into
 * buf, separated by single spaces, in the order
 *   Ir.a  Dr.a Dw.a  Ir.m1 Dr.m1 Dw.m1  Ir.m2 Dr.m2 Dw.m2
 * but leave out the run of zero counters at the end of this list.
 * Ir.a is always written; without cache simulation nothing else is.
 * Returns the number of characters written.
 */
static
int sprint_fcc(Char* buf, fCC* fcc)
{
    unsigned long long field[8];
    int used = 8;
    int len, k;

    len = VG_(sprintf)(buf, "%llu", fcc->Ir.a);

    /* if no full simulation, output instruction access only */
    if (!clo_simulate_cache) return len;

    /* everything following Ir.a, in output order */
    field[0] = fcc->Dr.a;
    field[1] = fcc->Dw.a;
    field[2] = fcc->Ir.m1;
    field[3] = fcc->Dr.m1;
    field[4] = fcc->Dw.m1;
    field[5] = fcc->Ir.m2;
    field[6] = fcc->Dr.m2;
    field[7] = fcc->Dw.m2;

    /* strip zeros from the end */
    while ((used > 0) && (field[used-1] == 0))
        used--;

    for (k = 0; k < used; k++)
        len += VG_(sprintf)(buf+len, " %llu", field[k]);

    return len;
}
#endif




/*------------------------------------------------------------*/
/*--- Instruction-level Cost center types, operations      ---*/
/*------------------------------------------------------------*/

/* The typedefs for the instruction-level cost center structs follow below.
 *
 * WARNING:  the 'tag' field *must* be the first byte of all CC types.
 * 
 * This is because we use it to work out what kind of CC we're dealing with.
 */ 

/* Kind of an instruction-level cost center, stored in its 'tag' field.
 * InstrCC is set by init_iCC(), ReadWriteCC by init_iddCC(); the tag of
 * an idCC is passed in by the caller of init_idCC().
 */
typedef 
   enum {
      InstrCC,         /* eg. mov %eax,   %ebx                      */
      ReadCC,          /* eg. mov (%ecx), %esi                      */
      WriteCC,         /* eg. mov %eax,   (%edx)                    */
      ModCC,           /* eg. incl (%eax) (read+write one addr)     */
      ReadWriteCC,     /* eg. call*l (%esi), pushl 0x4(%ebx), movsw 
                               (read+write two different addrs)      */
   } CC_type;


/* Cost center for an instruction without data access:
 * only the instruction fetch cost (I) is counted.
 */
typedef
   struct _iCC {
      /* word 1 */
      UChar tag;          /* CC_type; has to stay the first byte */
      UChar instr_size;   /* instruction size as given to init_iCC() */
      /* 2 bytes padding */

      /* words 2+ */
      Addr instr_addr;    /* address of the instruction */
      CC I;               /* instruction fetch cost */
   }
   iCC;

/* Cost center for an instruction with one data access:
 * instruction fetch cost (I) and data access cost (D).
 */
typedef
   struct _idCC {
      /* word 1 */
      UChar tag;          /* CC_type; has to stay the first byte */
      UChar instr_size;   /* instruction size as given to init_idCC() */
      UChar data_size;    /* data size as given to init_idCC() */
      /* 1 byte padding */

      /* words 2+ */
      Addr instr_addr;    /* address of the instruction */
      CC I;               /* instruction fetch cost */
      CC D;               /* data access cost */
   }
   idCC;

/* Cost center for an instruction with two data accesses:
 * instruction fetch cost (I) and two data access costs (Da, Db).
 */
typedef
   struct _iddCC {
      /* word 1 */
      UChar tag;          /* CC_type; has to stay the first byte */
      UChar instr_size;   /* instruction size as given to init_iddCC() */
      UChar data_size;    /* data size as given to init_iddCC() */
      /* 1 byte padding */

      /* words 2+ */
      Addr instr_addr;    /* address of the instruction */
      CC I;               /* instruction fetch cost */
      CC Da;              /* cost of the first data access */
      CC Db;              /* cost of the second data access */
   }
   iddCC;

/* Set up an iCC for the instruction at instr_addr: tag is InstrCC,
 * the cost is zeroed.
 */
static void init_iCC(iCC* cc, Addr instr_addr, UInt instr_size)
{
   init_cc(&(cc->I));

   cc->instr_addr = instr_addr;
   cc->instr_size = instr_size;
   cc->tag        = InstrCC;
}

/* Set up an idCC for the instruction at instr_addr: the tag is taken
 * from X_CC, both costs (I and D) are zeroed.
 */
static void init_idCC(CC_type X_CC, idCC* cc, Addr instr_addr,
                      UInt instr_size, UInt data_size)
{
   init_cc(&(cc->I));
   init_cc(&(cc->D));

   cc->instr_addr = instr_addr;
   cc->data_size  = data_size;
   cc->instr_size = instr_size;
   cc->tag        = X_CC;
}

/* Set up an iddCC for the instruction at instr_addr: tag is
 * ReadWriteCC, all three costs (I, Da, Db) are zeroed.
 */
static void init_iddCC(iddCC* cc, Addr instr_addr,
                       UInt instr_size, UInt data_size)
{
   init_cc(&(cc->I));
   init_cc(&(cc->Da));
   init_cc(&(cc->Db));

   cc->instr_addr = instr_addr;
   cc->data_size  = data_size;
   cc->instr_size = instr_size;
   cc->tag        = ReadWriteCC;
}

          
/* If 1, address of each instruction is printed as a comment after its counts
 */
#define PRINT_INSTR_ADDRS 0

/* Write the counters of an iCC as one text line into buf; with
 * PRINT_INSTR_ADDRS the instruction address is appended as comment.
 * Returns the result of VG_(sprintf).
 */
static __inline__ int sprint_iCC(Char* buf, iCC* cc)
{
   int len;

#if PRINT_INSTR_ADDRS
   len = VG_(sprintf)(buf, "%llu %llu %llu # %x\n",
                      cc->I.a, cc->I.m1, cc->I.m2, cc->instr_addr);
#else
   len = VG_(sprintf)(buf, "%llu %llu %llu\n",
                      cc->I.a, cc->I.m1, cc->I.m2);
#endif

   return len;
}

/* Write the I and D counters of an idCC as one text line into buf;
 * with PRINT_INSTR_ADDRS the instruction address is appended as
 * comment. Returns the result of VG_(sprintf).
 */
static __inline__ int sprint_read_or_mod_CC(Char* buf, idCC* cc)
{
    int len;

#if PRINT_INSTR_ADDRS
    len = VG_(sprintf)(buf, "%llu %llu %llu %llu %llu %llu # %x\n",
                       cc->I.a, cc->I.m1, cc->I.m2,
                       cc->D.a, cc->D.m1, cc->D.m2, cc->instr_addr);
#else
    len = VG_(sprintf)(buf, "%llu %llu %llu %llu %llu %llu\n",
                       cc->I.a, cc->I.m1, cc->I.m2,
                       cc->D.a, cc->D.m1, cc->D.m2);
#endif

    return len;
}

/* Write the I and D counters of an idCC as one text line into buf,
 * with three "." placeholders between the I and the D counters; with
 * PRINT_INSTR_ADDRS the instruction address is appended as comment.
 * Returns the result of VG_(sprintf).
 */
static __inline__ int sprint_write_CC(Char* buf, idCC* cc)
{
    int len;

#if PRINT_INSTR_ADDRS
    len = VG_(sprintf)(buf, "%llu %llu %llu . . . %llu %llu %llu # %x\n",
                       cc->I.a, cc->I.m1, cc->I.m2,
                       cc->D.a, cc->D.m1, cc->D.m2, cc->instr_addr);
#else
    len = VG_(sprintf)(buf, "%llu %llu %llu . . . %llu %llu %llu\n",
                       cc->I.a, cc->I.m1, cc->I.m2,
                       cc->D.a, cc->D.m1, cc->D.m2);
#endif

    return len;
}

/* Write the I, Da and Db counters of an iddCC as one text line into
 * buf; with PRINT_INSTR_ADDRS the instruction address is appended as
 * comment. Returns the result of VG_(sprintf).
 *
 * The PRINT_INSTR_ADDRS format used to have only six "%llu" for the
 * nine counters passed, so the address conversion consumed a counter
 * argument; the format now has one conversion per argument.
 */
static __inline__ int sprint_read_write_CC(Char* buf, iddCC* cc)
{
#if PRINT_INSTR_ADDRS
    return VG_(sprintf)(buf,
			"%llu %llu %llu %llu %llu %llu %llu %llu %llu # %x\n",
			cc->I.a,  cc->I.m1,  cc->I.m2, 
			cc->Da.a, cc->Da.m1, cc->Da.m2,
			cc->Db.a, cc->Db.m1, cc->Db.m2, cc->instr_addr);
#else
    return VG_(sprintf)(buf, "%llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
			cc->I.a,  cc->I.m1,  cc->I.m2, 
			cc->Da.a, cc->Da.m1, cc->Da.m2,
			cc->Db.a, cc->Db.m1, cc->Db.m2);
#endif
}



/*------------------------------------------------------------*/
/*--- Jump Cost Center (JCC) operations                    ---*/
/*------------------------------------------------------------*/



/* Hash slot for a call arc: combines the two BBCC pointer values
 * (both converted to UInt) and reduces the sum modulo the table size.
 */
__inline__
static UInt jcc_hash(BBCC* from, BBCC* to, UInt size)
{
    UInt src = (UInt)from;
    UInt dst = (UInt)to;

    return (src + 7*dst) % size;
} 

/* double size of jcc table  */
static void resize_jcc_table()
{
    Int i, new_size, conflicts1 = 0, conflicts2 = 0;
    jCC** new_table;
    UInt new_hash;
    jCC *curr_jcc, *next_jcc;

    new_size  = 2* jcc_table_size +3;
    new_table = (jCC**) VG_(malloc)(new_size * sizeof(jCC*));
 
    if (!new_table) return;
 
    for (i = 0; i < new_size; i++)
      new_table[i] = NULL;
 
    for (i = 0; i < jcc_table_size; i++) {
	if (jcc_table[i] == NULL) continue;
 
	curr_jcc = jcc_table[i];
	while (NULL != curr_jcc) {
	    next_jcc = curr_jcc->next_hash;

	    new_hash = jcc_hash(curr_jcc->from, 
				curr_jcc->to, new_size);

	    curr_jcc->next_hash = new_table[new_hash];
	    new_table[new_hash] = curr_jcc;
	    if (curr_jcc->next_hash) {
		conflicts1++;
		if (curr_jcc->next_hash->next_hash)
		    conflicts2++;
	    }

	    curr_jcc = next_jcc;
	}
    }

    VG_(free)(jcc_table);


    CT_DEBUG(0, "Resize JCC Hash: %d => %d (entries %d, conflicts %d/%d)\n",
	     jcc_table_size, new_size,
	     jcc_table_entries, conflicts1, conflicts2);

    jcc_table_size = new_size;
    jcc_table      = new_table;
    jcc_table_resizes++;
}



/* Create the jCC for a call from BBCC <from> to BBCC <to>.
 * For a spontaneous call, from is 0 (i.e. caller unknown).
 *
 * The new jCC starts with a zero call counter and zero cost. It is
 * put in front of the jCC list of the calling BBCC (or of the list of
 * spontaneous calls), and in front of its hash chain.
 */
static jCC* new_jcc(BBCC* from, BBCC* to)
{
   jCC* jcc;
   UInt slot;

   /* Count the new entry first; the table is resized when the integer
    * quotient 10*entries/size gets larger than 8 */
   jcc_table_entries++;
   if (10 * jcc_table_entries / jcc_table_size > 8)
       resize_jcc_table();

   jcc = VG_(malloc)(sizeof(jCC));

   jcc->from         = from;
   jcc->to           = to;
   jcc->jmpkind      = JmpCall;
   jcc->call_counter = 0;
   init_fcc( &(jcc->sum) );

   /* The per-caller chain is only used at dumping time */
   if (from == 0) {
       jcc->next_from  = jcc_spontaneous;
       jcc_spontaneous = jcc;
   }
   else {
       jcc->next_from = from->jcc_list;
       from->jcc_list = jcc;
   }

   /* hash chain; uses the table size valid after a possible resize */
   slot = jcc_hash(from, to, jcc_table_size);
   jcc->next_hash  = jcc_table[slot];
   jcc_table[slot] = jcc;

   CT_DEBUGIF(3) {
      VG_(printf)("new_jcc: ");
      print_jcc(9, jcc);
   }

   return jcc;
}


/* Get the jCC for a call arc (BBCC->BBCC), creating it if needed.
 *
 * Lookup order: the jCC last used with <to> as target, the jCC last
 * used with <from> as source, then the hash table. Both "last used"
 * pointers are updated on a hash lookup/creation.
 *
 * <from> may be 0 for a spontaneous call (see new_jcc); in this case
 * the per-source cache is skipped, as there is no BBCC to hold it.
 * <to> has to be a valid BBCC.
 */
static jCC* get_jcc(BBCC* from, BBCC* to)
{
    jCC* jcc;
    Int hash;

    CT_DEBUG(5, "+get_jcc(bbcc 0x%p => bbcc 0x%p)\n",
		from, to);

    /* first check last recently used JCC */
    jcc = to->lru_to_jcc;
    if (jcc && (jcc->from == from)) {
	CT_ASSERT(to == jcc->to);
	CT_DEBUG(5,"-get_jcc: [LRU to] jcc 0x%p\n", jcc);
	return jcc;
    }

    if (from) {
	jcc = from->lru_from_jcc;
	if (jcc && (jcc->to == to)) {
	    CT_ASSERT(from == jcc->from);
	    CT_DEBUG(5, "-get_jcc: [LRU from] jcc 0x%p\n", jcc);
	    return jcc;
	}
    }

    jcc_lru_misses++;

    hash = jcc_hash(from, to, jcc_table_size);
    jcc = jcc_table[hash];

    while(jcc) {
	if ((jcc->from == from) && 
	    (jcc->to == to)) break;
	jcc = jcc->next_hash;
    }

    if (!jcc)
	jcc = new_jcc(from, to);

    /* set LRU */
    if (from)
	from->lru_from_jcc = jcc;
    to->lru_to_jcc = jcc;

    CT_DEBUG(5, "-get_jcc: jcc 0x%p\n", jcc);

    return jcc;
}


/*------------------------------------------------------------*/
/*--- Object/File/Function hash entry operations           ---*/
/*------------------------------------------------------------*/

/* Clear all N_OBJ_ENTRIES slots of the object hash table */
static void init_obj_table()
{
    Int slot = 0;

    while (slot < N_OBJ_ENTRIES) {
        obj_table[slot] = 0;
        slot++;
    }
}

#define HASH_CONSTANT   256

/* Hash a zero-terminated string into the range [0, table_size):
 * per character, the value so far is multiplied by HASH_CONSTANT, the
 * character is added, and the result is reduced modulo table_size.
 * The empty string hashes to 0.
 */
static UInt str_hash(const UChar *s, UInt table_size)
{
    int acc = 0;

    while (*s) {
        acc = (HASH_CONSTANT * acc + *s) % table_size;
        s++;
    }
    return acc;
}


/* Object name used by new_obj_node()/get_obj_node() if there is no
 * segment info. It is shared, not copied, by the object nodes.
 */
static UChar* anonymous_obj = "???";

/* Allocate an object node. With segment info, the node gets a copy of
 * the segment file name and its start, size and symbol offset; without
 * (si == 0), the name is anonymous_obj and these values are 0.
 * The file hash of the node starts empty, the node gets the next
 * object number, and <next> becomes its successor in the hash chain.
 */
static __inline__ 
obj_node* new_obj_node(SegInfo* si, obj_node* next)
{
   obj_node* obj = VG_(malloc)(sizeof(obj_node));
   Int slot;

   if (si) {
      obj->name   = (UChar*)VG_(strdup)( VG_(seg_filename)(si) );
      obj->start  = VG_(seg_start)(si);
      obj->size   = VG_(seg_size)(si);
      obj->offset = VG_(seg_sym_offset)(si);
   }
   else {
      obj->name   = anonymous_obj;
      obj->start  = 0;
      obj->size   = 0;
      obj->offset = 0;
   }

   for (slot = 0; slot < N_FILE_ENTRIES; slot++)
      obj->files[slot] = NULL;

   obj->number = ++distinct_objs;
   obj->next   = next;

   return obj;
}


/* Get the object node for a segment info (for si == 0: the node of
 * the anonymous object). Nodes are looked up by name; a missing node
 * is created and put in front of its hash chain.
 */
static
obj_node* get_obj_node(SegInfo* si)
{
    const UChar* wanted = si ? VG_(seg_filename)(si) : anonymous_obj;
    UInt         slot   = str_hash(wanted, N_OBJ_ENTRIES);
    obj_node*    obj;

    for (obj = obj_table[slot]; obj != NULL; obj = obj->next) {
        if (VG_(strcmp)(wanted, obj->name) == 0)
            return obj;
    }

    /* not known up to now */
    obj = new_obj_node(si, obj_table[slot]);
    obj_table[slot] = obj;

    return obj;
}


/* Allocate a file node for object <obj>. The node gets a copy of
 * <filename>, an empty function hash and the next file number;
 * <next> becomes its successor in the hash chain.
 */
static __inline__ 
file_node* new_file_node(Char filename[FILENAME_LEN],
			 obj_node* obj, file_node* next)
{
    file_node* file = VG_(malloc)(sizeof(file_node));
    Int slot;

    file->name = VG_(strdup)(filename);

    for (slot = 0; slot < N_FN_ENTRIES; slot++)
        file->fns[slot] = NULL;

    file->number = ++distinct_files;
    file->obj    = obj;
    file->next   = next;

    return file;
}

 
/* Get the file node called <filename> inside of the given object node.
 * A missing node is created and put in front of its hash chain.
 */
static __inline__
file_node* get_file_node(obj_node* curr_obj_node,
			 Char filename[FILENAME_LEN])
{
    UInt       slot = str_hash(filename, N_FILE_ENTRIES);
    file_node* file;

    for (file = curr_obj_node->files[slot]; file != NULL; file = file->next) {
        if (VG_(strcmp)(filename, file->name) == 0)
            return file;
    }

    /* not known up to now */
    file = new_file_node(filename, curr_obj_node,
                         curr_obj_node->files[slot]);
    curr_obj_node->files[slot] = file;

    return file;
}


/* Allocate a function node for file <file>. The node gets a copy of
 * <fnname> and the next function number; all per-function flags start
 * switched off, caller depth and recursion level start with the values
 * from the command line options. <next> becomes the successor in the
 * hash chain. Afterwards, the fn_active array is enlarged if the
 * number of functions has reached its size.
 */
static __inline__ 
fn_node* new_fn_node(Char fnname[FILENAME_LEN],
		     file_node* file, fn_node* next)
{
    fn_node* fn = VG_(malloc)(sizeof(fn_node));

    /* identity and linkage */
    fn->name     = VG_(strdup)(fnname);
    fn->number   = ++distinct_fns;
    fn->file     = file;
    fn->next     = next;
    fn->last_cxt = 0;
    fn->pure_cxt = 0;

    /* configuration defaults */
    fn->dump_before    = False;
    fn->dump_after     = False;
    fn->zero_before    = False;
    fn->toggle_collect = False;
    fn->skip           = False;
    fn->group          = 0;
    fn->fn_caller      = clo_fn_caller;
    fn->fn_recursion   = clo_fn_recursion;

#if JCC_DEBUG
    fn->verbosity      = CONFIG_DEFAULT; /* no change */
#endif

    if (fn_active_array_size <= distinct_fns)
        resize_fn_active_array();

    return fn;
}


/* Get the function node called <fnname> inside of a known file node
 * (which must not be null). A missing node is created and put in
 * front of its hash chain.
 */
static
fn_node* get_fn_node(file_node* curr_file_node,
		     Char fnname[FN_NAME_LEN])
{
    UInt     slot;
    fn_node* fn;

    CT_ASSERT(curr_file_node != 0);

    slot = str_hash(fnname, N_FN_ENTRIES);
    for (fn = curr_file_node->fns[slot]; fn != NULL; fn = fn->next) {
        if (VG_(strcmp)(fnname, fn->name) == 0)
            return fn;
    }

    /* not known up to now */
    fn = new_fn_node(fnname, curr_file_node,
                     curr_file_node->fns[slot]);
    curr_file_node->fns[slot] = fn;

    return fn;
}


/* Get the function node for (segment info, file name, function name),
 * going through object node and file node. All nodes on this way are
 * created if needed.
 */
static __inline__
fn_node* get_fn_node2(SegInfo* si,
		      Char filename[FILENAME_LEN],
		      Char fnname[FN_NAME_LEN])
{
    file_node* file = get_file_node(get_obj_node(si), filename);

    return get_fn_node(file, fnname);
}


/*------------------------------------------------------------*/
/*--- fn_config operations                                 ---*/
/*------------------------------------------------------------*/

/* Configurations for function name prefix patterns.
 * Currently, only very limited patterns are possible:
 * Exact prefix patterns and "*::" are allowed.
 * E.g.
 *  - "abc" matches all functions starting with "abc".
 *  - "abc*::def" matches all functions starting with "abc" and
 *    starting with "def" after the first "::" separator.
 *  - "*::print(" matches C++ methods "print" in all classes
 *    without namespace. I.e. "*" doesn't match a "::".
 *
 * We build a trie from the patterns, and for a given function, we
 * go down the trie and apply all non-default configurations.
 */


#define NODE_DEGREE 30

/* node of compressed trie search structure */
typedef struct _config_node config_node;
struct _config_node {
    /* Pattern string of this node; the pointer is stored as given to
     * new_config(), it is not copied there */
    Char* name;
    /* Number of leading characters of a name which are compared
     * against the configs of this node (see update_fn_config2) */
    Int length;
    
    /* List of configs attached to this node, linked via fn_config.next */
    fn_config* config;
    /* Children, indexed by (next character % NODE_DEGREE) */
    config_node* sub_node[NODE_DEGREE];
    /* Subtree for the part of a pattern behind a wildcard ("*" with
     * optional "::") */
    config_node* cpp_sep;
};

/* Root of the pattern trie. It is 0 as long as no pattern was added;
 * get_fnc() inserts into it, update_fn_config() walks it. */
static config_node* fn_configs = 0;

/* Allocate a function config for pattern <name> (the name is copied).
 * Every setting starts as CONFIG_DEFAULT, i.e. "not specified"; the
 * config is not linked into any list.
 */
static __inline__ 
fn_config* new_fnc(Char* name)
{
   fn_config* fnc = VG_(malloc)(sizeof(fn_config));

   fnc->name = VG_(strdup)(name);
   fnc->next = 0;

   fnc->dump_before    = CONFIG_DEFAULT;
   fnc->dump_after     = CONFIG_DEFAULT;
   fnc->zero_before    = CONFIG_DEFAULT;
   fnc->toggle_collect = CONFIG_DEFAULT;
   fnc->skip           = CONFIG_DEFAULT;
   fnc->group          = CONFIG_DEFAULT;
   fnc->fn_caller      = CONFIG_DEFAULT;
   fnc->fn_recursion   = CONFIG_DEFAULT;
#if JCC_DEBUG
   fnc->verbosity      = CONFIG_DEFAULT;
#endif

   CT_DEBUG(2, "  new_fnc('%s')\n", name);

   return fnc;
}


/* Search the config list starting at *pfnc for a config with exactly
 * the given name. If there is none, a new config is created and put
 * in front of the list (changing *pfnc).
 */
static fn_config* get_fnc1(fn_config** pfnc, Char* name)
{
    fn_config* fnc;

    for (fnc = *pfnc; fnc != 0; fnc = fnc->next) {
        if (VG_(strcmp)(fnc->name,name) == 0)
            return fnc;
    }

    fnc = new_fnc(name);
    fnc->next = *pfnc;
    *pfnc = fnc;

    return fnc;
}


/* Allocate a trie node for <name> with the given length. The name
 * pointer is stored as is (no copy). The node has no config, no
 * children and no wildcard subtree.
 */
static config_node* new_config(Char* name, int length)
{
    config_node* node = VG_(malloc)(sizeof(config_node));
    int child = NODE_DEGREE;

    node->name    = name;
    node->length  = length;
    node->config  = 0;
    node->cpp_sep = 0;
    while (child > 0)
        node->sub_node[--child] = 0;

    CT_DEBUG(2, "  new_config('%s', len %d)\n", name, length);

    return node;
}

/* Search the first wildcard '*' in n. Returns a pointer behind the
 * wildcard and behind up to two ':' directly following it.
 * If there is no wildcard, the result points to the terminating zero
 * of n, i.e. *(return) is 0.
 */
static Char* next_wild_cpp_sep(Char* n)
{
    for ( ; *n != 0; n++) {
        if (*n != '*') continue;

        /* step over the wildcard and an optional ":" or "::" */
        n++;
        if (*n == ':') {
            n++;
            if (*n == ':')
                n++;
        }
        break;
    }
    return n;
}

/* Get function config for a specific name in search tree.
 *
 * pnode:  slot holding the (sub)trie to search/extend; must not be null.
 *         The slot is overwritten when a node is created or split.
 * name:   pattern string
 * offset: index into name where the comparison continues
 *
 * Returns the config for the pattern; configs and trie nodes are
 * created as needed.
 */
static fn_config* get_fnc2(config_node** pnode, Char* name, int offset)
{
    config_node* node;
    fn_config* fnc;
    Char* cpp_sep;

    CT_ASSERT(pnode != 0);
    node = *pnode;

    /* Empty slot: create a node (and config) for this pattern */
    if (node == 0) {
	fnc = new_fnc(name);
	cpp_sep = next_wild_cpp_sep(name+offset);
	if (*cpp_sep) {
	    /* found wildcard */
	    int newOffset = cpp_sep - name;
	    /* This node stores the caller's name pointer with length
	     * <offset>; the part behind the wildcard goes into the
	     * cpp_sep subtree. */
	    node = new_config(name, offset);
	    node->config = fnc;
	    *pnode = node;
	    CT_DEBUG(3, "  get_config('%s', off %d, new %d): cpp_sep\n",
		     name, offset, newOffset);
	    /* recurse */
	    /* NOTE(review): cpp_sep already points behind the wildcard,
	     * but newOffset (measured from name) is passed as offset
	     * into it; the '*' case further down passes 0 - confirm. */
	    return get_fnc2( &(node->cpp_sep), cpp_sep, newOffset);
	}
	/* No wildcard: the node covers the full copied pattern name */
	node = new_config(fnc->name, VG_(strlen)(fnc->name));
	node->config = fnc;
	*pnode = node;

	CT_DEBUG(3, "  get_config('%s', offset %d): new\n",
		 name, offset);

	return fnc;
    }
    
    /* Pattern ends here: use the config list of this node */
    if (name[offset] == 0) {
	CT_DEBUG(3, "  get_config('%s', off %d): found\n", name, offset);
	return get_fnc1(&node->config, name);
    }

    if (name[offset] == '*') {
	/* the "::" is optional in the pattern */
	offset++;
	if (name[offset] == ':') {
	    offset++;
	    if (name[offset] == ':') offset++;
	}
	CT_DEBUG(3, "  get_config('%s', off %d): cpp_sep\n", name, offset);
	return get_fnc2( &(node->cpp_sep), name+offset, 0);
    }

    /* FIXME: Should handle wildcard */
    /* Advance while name and node name agree. Characters are compared
     * modulo NODE_DEGREE only, so different characters with the same
     * remainder are taken as equal here. */
    while(offset < node->length) {
	if ((name[offset] == 0)  ||
	    ((name[offset]%NODE_DEGREE) != (node->name[offset]%NODE_DEGREE)))
	    break;
	offset++;
    }

    /* name is shorter or different as nodes name */
    if (offset < node->length) {
	config_node* new;

	/* split up this node */
	fnc = new_fnc(name);
	new = new_config(node->name, offset);

	/* the old node becomes a child of the new common-prefix node */
	new->sub_node[ node->name[offset]%NODE_DEGREE ] = node;
	    
	if (name[offset]==0) {
	    /* no subnode, insert into new node */
	    new->config = fnc;
	}
	else {
	    /* NOTE(review): the sibling node is named after node->name
	     * but gets the length of <name> - confirm this is intended. */
	    config_node* new2 = new_config(node->name, VG_(strlen)(name));
	    new->sub_node[ name[offset]%NODE_DEGREE ] = new2;
	    new2->config = fnc;
	}
	*pnode = new;

	CT_DEBUG(3, "  get_config('%s', off %d): splitted\n", name, offset);

	return fnc;
    }

    /* name and node name are congruent */
    if (name[offset] == 0) {
	CT_DEBUG(3, "  get_config('%s', off %d): found\n", name, offset);
	return get_fnc1(&node->config, name);
    }

    /* name is longer than the nodes name: append new node */
    CT_DEBUG(3, "  get_config('%s', off %d): next\n", name, offset);

    /* recurse */
    return get_fnc2( &(node->sub_node[ name[offset]%NODE_DEGREE ]),
		     name, offset);
}

/* Get a function config for a given prefix name: looks the pattern up
 * in the global trie fn_configs (starting at offset 0), which creates
 * the config if it does not exist yet.
 */
static fn_config* get_fnc(Char* name)
{
    return get_fnc2(&fn_configs, name, 0);
}


/* Apply a function config to a function node.
 *
 * Only settings which are specified in fnc (i.e. which are not
 * CONFIG_DEFAULT) change the node. The flags dump_before, dump_after,
 * zero_before, toggle_collect and skip become True exactly if the
 * config value is CONFIG_TRUE; group, fn_caller and fn_recursion are
 * copied as values.
 *
 * The verbosity setting is handled with JCC_DEBUG only: new_fnc() and
 * new_fn_node() initialize this field only under the same condition,
 * so it must not be read without it.
 */
static void update_fn_config1(fn_node* fn, fn_config* fnc)
{
    if (fnc->dump_before != CONFIG_DEFAULT)
	fn->dump_before = (fnc->dump_before == CONFIG_TRUE);

    if (fnc->dump_after != CONFIG_DEFAULT)
	fn->dump_after = (fnc->dump_after == CONFIG_TRUE);

    if (fnc->zero_before != CONFIG_DEFAULT)
	fn->zero_before = (fnc->zero_before == CONFIG_TRUE);

    if (fnc->toggle_collect != CONFIG_DEFAULT)
	fn->toggle_collect = (fnc->toggle_collect == CONFIG_TRUE);

    if (fnc->skip != CONFIG_DEFAULT)
	fn->skip = (fnc->skip == CONFIG_TRUE);

    if (fnc->group != CONFIG_DEFAULT)
	fn->group = fnc->group;

    if (fnc->fn_caller != CONFIG_DEFAULT)
	fn->fn_caller = fnc->fn_caller;

    if (fnc->fn_recursion != CONFIG_DEFAULT)
	fn->fn_recursion = fnc->fn_recursion;

#if JCC_DEBUG
    if (fnc->verbosity != CONFIG_DEFAULT)
	fn->verbosity = fnc->verbosity;
#endif
}

/* Search first cpp separator ("::").
 *
 * Returns the index directly behind the first "::" in n. If there is
 * no separator, the index of the terminating zero is returned, i.e.
 * n[return] is 0 (this is also the case for a "::" at the very end).
 *
 * The character behind a ':' is only peeked at, never skipped: the
 * former version advanced twice for a single ':', which stepped over
 * the terminating zero for a name ending in one ':' and continued
 * reading behind the string.
 */
static Int next_cpp_sep(Char* n)
{
    Int p = 0;
    while(n[p]!=0) {
	/* n[p] is not the terminator, so reading n[p+1] is safe */
	if ((n[p] == ':') && (n[p+1] == ':'))
	    return p+2;
	p++;
    }
    return p;
}

/* Walk the pattern trie starting at <node> with the function name
 * <name>, and apply every matching config found on the way to <fn>.
 * For matching nodes with a wildcard subtree, the walk recurses into
 * that subtree with the part of the name behind the first "::".
 */
static void update_fn_config2(fn_node* fn, char* name, config_node* node)
{
    fn_config* fnc;
    int len, cpp_sep_pos = -1;

    /* len: length of name; cpp_sep_pos: index behind the first "::"
     * if one is followed by further characters, -1 else */
    len = next_cpp_sep(name);
    if (name[len]) {
	cpp_sep_pos = len;
	while(name[len]) len++;

	CT_DEBUG(3, "  update_fn_config('%s', off %d): cpp_sep\n",
		 name, cpp_sep_pos);
    }

    /* only nodes not longer than the name can match */
    while(node && (node->length <= len)) {
	/* search a config of this node whose first node->length
	 * characters agree with the name */
	fnc = node->config;
	while(fnc) {
	    if (VG_(strncmp)(fnc->name,name,node->length) == 0) break;
	    fnc = fnc->next;
	}
    
	if (fnc) {
	    CT_DEBUG(3, "  update_fn_config('%s', len %d): found\n",
		     fnc->name, node->length);

	    update_fn_config1(fn, fnc);

	    /* recurse on wildcard patterns (depth first search).
	     * Better would be to build a state machine...
	     */
	    if (node->cpp_sep && cpp_sep_pos>0)
		update_fn_config2(fn, name + cpp_sep_pos, node->cpp_sep);
	}

	/* name fully consumed: no child to descend into */
	if (name[node->length] == 0) break;
	/* child is selected by the first character behind this node */
	node = node->sub_node[ name[node->length]%NODE_DEGREE ];
    }
}

/* Update function config according to configs of name prefixes:
 * walks the global pattern trie fn_configs with the name of fn.
 */
static void update_fn_config(fn_node* fn)
{
    CT_DEBUG(3, "  update_fn_config('%s')\n", fn->name);
    update_fn_config2(fn, fn->name, fn_configs);
}
	

/*------------------------------------------------------------*/
/*--- Basic block (BB) operations                          ---*/
/*------------------------------------------------------------*/


/* Allocate the BB hash table with N_BB_INITIAL_ENTRIES empty slots */
static void init_bb_table()
{
   Int slot = 0;

   bb_table_entries = 0;
   bb_table_size    = N_BB_INITIAL_ENTRIES;
   bb_table = (BB**) VG_(malloc)(bb_table_size * sizeof(BB*));

   while (slot < bb_table_size) {
      bb_table[slot] = NULL;
      slot++;
   }
}


/* double size of bb table  */
static void resize_bb_table()
{
    Int i, new_size, conflicts1 = 0, conflicts2 = 0;
    BB **new_table, *curr, *next;
    UInt new_hash;

    new_size  = 2* bb_table_size +3;
    new_table = (BB**) VG_(malloc)(new_size * sizeof(BB*));
 
    if (!new_table) return;
 
    for (i = 0; i < new_size; i++)
      new_table[i] = NULL;
 
    for (i = 0; i < bb_table_size; i++) {
	if (bb_table[i] == NULL) continue;
 
	curr = bb_table[i];
	while (NULL != curr) {
	    next = curr->next;

	    new_hash = curr->addr % new_size;

	    curr->next = new_table[new_hash];
	    new_table[new_hash] = curr;
	    if (curr->next) {
		conflicts1++;
		if (curr->next->next)
		    conflicts2++;
	    }

	    curr = next;
	}
    }

    VG_(free)(bb_table);


    CT_DEBUG(0, "Resize BB Hash: %d => %d (entries %d, conflicts %d/%d)\n",
	     bb_table_size, new_size,
	     bb_table_entries, conflicts1, conflicts2);

    bb_table_size = new_size;
    bb_table      = new_table;
    bb_table_resizes++;
}


/* forward decl. */
static fn_node* new_bb_fn(BB* bb);


/**
 * Allocate a new BB structure for the basic block starting at addr,
 * and insert it into the BB hash table.
 *
 * Section kind and object node are looked up from the address; all
 * other fields (including jmp_addr and array_size) start as 0.
 * At the end, new_bb_fn() is called for the new BB.
 *
 * TODO: Allocate BB and BBCC together
 */
static BB* new_bb(Addr addr)
{
   BB* bb;
   UInt slot;

   /* Count the new entry first; the BB table is resized when the
    * integer quotient 10*entries/size gets larger than 8 */
   bb_table_entries++;
   if (10 * bb_table_entries / bb_table_size > 8)
       resize_bb_table();

   bb = VG_(malloc)(sizeof(BB));

   bb->addr        = addr;
   bb->sect_kind   = VG_(seg_sect_kind)(addr);
   bb->obj         = get_obj_node( VG_(get_obj)(addr) );

   bb->jmp_addr    = 0; // to be set in get_bbcc()
   bb->size        = 0;
   bb->fn          = 0;
   bb->line        = 0;
   bb->is_entry    = 0;
   bb->bbcc_list   = 0;
   bb->last_bbcc   = 0;
   bb->array_size  = 0;
   bb->instr_count = 0;

   /* hash chain; uses the table size valid after a possible resize */
   slot = addr % bb_table_size;
   bb->next = bb_table[slot];
   bb_table[slot] = bb;

#if JCC_DEBUG
   CT_DEBUGIF(3) {
      VG_(printf)("  new_bb: ");
      print_bb(0, bb);
      VG_(printf)("\n");
   }
#endif

   distinct_bbs++;

   new_bb_fn(bb);

   return bb;
}


/* Look up the BB structure for a BB start address.
 * Returns 0 if there is no BB for this address; nothing is created.
 */
static __inline__
BB* lookup_bb(Addr addr)
{
    Int slot = addr % bb_table_size;
    BB* bb;

    for (bb = bb_table[slot]; bb != 0; bb = bb->next) {
        if (bb->addr == addr)
            break;
    }

    CT_DEBUG(5, " lookup_bb: 0x%p\n", bb);
    return bb;
}


/* Get the BB structure for a BB start address; if there is none in
 * the hash table, a new one is created with new_bb().
 */
static BB* get_bb(Addr addr)
{
    Int slot;
    BB* bb;

    CT_DEBUG(5, "+ get_bb(0x%08x)\n", addr);

    slot = addr % bb_table_size;
    for (bb = bb_table[slot]; bb != 0; bb = bb->next) {
        if (bb->addr == addr)
            break;
    }

    if (bb == 0)
        bb = new_bb(addr);

    CT_DEBUG(5, "- get_bb: 0x%p\n", bb);

    return bb;
}

/**
 * Remove a BB structure
 *
 * Currently this only sets the address of the BB to 0; the structure
 * is neither freed nor taken out of the hash table.
 *
 * TODO:
 * Remove BBCCs / JCCs !!
 */
static void remove_bb(BB* bb)
{
    bb->addr = 0;
}
    

/*------------------------------------------------------------*/
/*--- Context operations                                   ---*/
/*------------------------------------------------------------*/




/* Allocate the context hash table with N_CXT_INITIAL_ENTRIES empty slots */
static void init_cxt_table()
{
   Int slot = 0;

   cxt_table_entries = 0;
   cxt_table_size    = N_CXT_INITIAL_ENTRIES;
   cxt_table = (Context**) VG_(malloc)(cxt_table_size * sizeof(Context*));

   while (slot < cxt_table_size) {
      cxt_table[slot] = NULL;
      slot++;
   }
}


/* double size of cxt table  */
static void resize_cxt_table()
{
    Int i, new_size, conflicts1 = 0, conflicts2 = 0;
    Context **new_table, *curr, *next;
    UInt new_hash;

    new_size  = 2* cxt_table_size +3;
    new_table = (Context**) VG_(malloc)(new_size * sizeof(Context*));
 
    if (!new_table) return;
 
    for (i = 0; i < new_size; i++)
      new_table[i] = NULL;
 
    for (i = 0; i < cxt_table_size; i++) {
	if (cxt_table[i] == NULL) continue;
 
	curr = cxt_table[i];
	while (NULL != curr) {
	    next = curr->next;

	    new_hash = curr->hash % new_size;

	    curr->next = new_table[new_hash];
	    new_table[new_hash] = curr;
	    if (curr->next) {
		conflicts1++;
		if (curr->next->next)
		    conflicts2++;
	    }

	    curr = next;
	}
    }

    VG_(free)(cxt_table);


    CT_DEBUG(0, "Resize Context Hash: %d => %d (entries %d, conflicts %d/%d)\n",
	     cxt_table_size, new_size,
	     cxt_table_entries,	conflicts1, conflicts2);

    cxt_table_size = new_size;
    cxt_table      = new_table;
    cxt_table_resizes++;
}


/* Hash value for a context given by a function stack. <fn> points to
 * the top entry; the entries are read downwards (fn, fn-1, ...) until
 * a 0 entry is found or <size> entries were used. Every entry is
 * mixed in by rotating the hash by 7 bits and adding the pointer value.
 */
__inline__
static UInt cxt_hash(fn_node** fn, UInt size)
{
    fn_node* top = *fn;
    UInt remaining = size;
    UInt acc = 0;

    for ( ; *fn != 0; fn--) {
        acc = (acc<<7) + (acc>>25) + (UInt)(*fn);
        if (--remaining == 0) break;
    }

    CT_DEBUG(6, "   cxt_hash(fn '%s', size %d): 0x%x\n",
                top->name, size, acc);
    return acc;
} 

__inline__
/* Check whether <cxt> describes the function chain whose top is <fn>.
 *
 * Returns False at once if the hash values differ. Otherwise the stored
 * functions of <cxt> are compared one by one with the chain, which is
 * walked downwards from <fn>; the comparison ends when the chain reaches
 * a 0 entry or all cxt->size stored entries have been checked.
 */
static Bool is_cxt(UInt hash, fn_node** fn, Context* cxt)
{
    int remaining;
    fn_node** stored;

    if (cxt->hash != hash) return False;

    stored = &(cxt->fn[0]);
    for (remaining = cxt->size; (*fn != 0) && (remaining > 0); remaining--) {
        if (*stored != *fn) return False;
        stored++;
        fn--;
    }

    return True;
}
	    

/**
 * Allocate new Context structure
 *
 * <fn> points to the top entry of the function chain the context is
 * created for. Returns 0 if that top entry is 0.
 *
 * The context stores up to (fn_caller + 1) functions of the chain,
 * copied while walking downwards from <fn> until a 0 entry is found.
 * The hash is computed with the same formula as in cxt_hash(). The new
 * context reserves max(fn_recursion, 1) numbers of the global context
 * counter and is linked at the head of its hash bucket.
 */
static Context* new_cxt(fn_node** fn)
{
    Context* new;
    UInt idx, hash, offset;
    int size, recs;
    fn_node* top_fn;

    CT_ASSERT(fn);
    top_fn = *fn;
    if (top_fn == 0) return 0;

    size = top_fn->fn_caller +1;
    recs = top_fn->fn_recursion;
    if (recs<1) recs=1;

    /* Check fill degree of context hash table and resize if needed (>80%).
     * The comparison is done by cross-multiplication: with the integer
     * division "10 * entries / size > 8" the quotient is truncated and
     * the resize would only be triggered at a fill degree of 90%.
     */
    cxt_table_entries++;
    if (10 * cxt_table_entries > 8 * cxt_table_size)
        resize_cxt_table();

    /* Context is allocated with room for <size> function pointers */
    new = VG_(malloc)(sizeof(Context)+sizeof(fn_node*)*(size-1));

    hash = 0;
    offset = 0;
    while(*fn != 0) {
        hash = (hash<<7) + (hash>>25) + (UInt)(*fn);
        new->fn[offset] = *fn;
        offset++;
        fn--;
        if (offset >= size) break;
    }
    /* chain was shorter than requested: only <offset> entries are valid */
    if (offset < size) size = offset;
    
    new->size        = size;
    new->base_number = context_counter;
    new->hash        = hash;

    context_counter += recs;
    distinct_contexts++;

    /* insert into Context hash table (after a possible resize above) */
    idx = hash % cxt_table_size;
    new->next = cxt_table[idx];
    cxt_table[idx] = new;

#if JCC_DEBUG
    CT_DEBUGIF(3) {
        VG_(printf)("  new_cxt: hash 0x%x ", hash);
        print_cxt(12, new, 0);
    }
#endif

    return new;
}


/* Get the Context structure for the function chain with top entry <fn>.
 *
 * Returns 0 if the top entry is 0. The context cached in the top
 * function (last_cxt) is tried first; on a miss the context hash table
 * is searched and, if nothing matches, a new context is created.
 * The result is stored as new last_cxt of the top function.
 */
static Context* get_cxt(fn_node** fn)
{
    fn_node* top;
    Context* match;
    UInt key, depth;

    CT_ASSERT(fn != 0);
    top = *fn;
    if (top == 0) return 0;
    depth = top->fn_caller+1;

    CT_DEBUG(5, "+ get_cxt(fn '%s'): size %d\n",
                top->name, depth);

    key = cxt_hash(fn, depth);

    /* fast path: context used last time for this function */
    match = top->last_cxt;
    if ((match != 0) && is_cxt(key, fn, match)) {
        CT_DEBUG(5, "- get_cxt: 0x%p\n", match);
        return match;
    }

    cxt_lru_misses++;

    for (match = cxt_table[key % cxt_table_size]; match; match = match->next) {
        if (is_cxt(key, fn, match)) break;
    }

    if (match == 0)
        match = new_cxt(fn);

    top->last_cxt = match;

    CT_DEBUG(5, "- get_cxt: %p\n", match);

    return match;
}

/**
 * Change execution context by calling a new function from current context
 *
 * The current context and the current fill level of the function stack
 * are always stored in the call stack entry at <call_stack_sp>, so that
 * pop_call_stack() can restore them.
 *
 * The function stack and the current context stay unchanged if <fn> is
 * already on top of the function stack, or if <fn> belongs to a function
 * group (group > 0) and the function on top is in the same group.
 * Otherwise <fn> is pushed (doubling the function stack if it is full)
 * and the current context is looked up for the new chain.
 */
static void push_cxt(fn_node* fn)
{
    /* save old context on stack (even if not changed at all!) */
    CT_ASSERT(call_stack_sp < call_stack_size);
    CT_ASSERT(call_stack[call_stack_sp].cxt == 0);
    call_stack[call_stack_sp].cxt = current_cxt;
    call_stack[call_stack_sp].fn_sp = fn_stack_top - fn_stack;

    if (*fn_stack_top == fn) return;

    /* The top entry is 0 as long as no function was pushed: it must not
     * be dereferenced for the group comparison in this case. */
    if (fn && (fn->group>0) &&
        (*fn_stack_top != 0) &&
        ((*fn_stack_top)->group == fn->group)) return;

    /* resizing needed ? */
    if (fn_stack_top == fn_stack+fn_stack_size-1) {
        int new_size = fn_stack_size*2;
        fn_node** new = VG_(malloc)(new_size * sizeof(fn_node*));
        int i;
        for(i=0;i<fn_stack_size;i++)
            new[i] = fn_stack[i];
        VG_(free)(fn_stack);
        /* keep the top pointer at the same offset in the new array */
        fn_stack_top = (fn_stack_top-fn_stack) + new;
        fn_stack = new;

        CT_DEBUG(0, "Resize Context Stack: %d => %d (pushing '%s')\n", 
                 fn_stack_size, new_size,
                 fn ? fn->name : (Char*)"0x0");

        fn_stack_size = new_size;
    }

    if (*fn_stack_top == 0) {
        /* this is first function: increment its active count */
        CT_ASSERT(fn != 0);
        CT_ASSERT(fn->number < fn_active_array_size);
        fn_active_array[fn->number]++;
    }

    fn_stack_top++;
    *fn_stack_top = fn;
    current_cxt = get_cxt(fn_stack_top);

    CT_DEBUG(5, "  push_cxt(fn '%s'): %d\n", 
                fn ? fn->name : (Char*)"0x0", fn_stack_top-fn_stack);
}
			       


/*------------------------------------------------------------*/
/*--- Call stack, operations                               ---*/
/*------------------------------------------------------------*/



/* forward decl */
static void cachesim_flush(Char* trigger,Bool only_current_thread);
static void zero_ccs(Bool only_current_thread);

/*
 * Allocate a cost record for skipped functions.
 * This is only needed for BBCC.skipped, if there is a CALL to a skipped
 * function. As this is assumed to be rare, the record is allocated
 * lazily by the caller.
 *
 * The record gets <from>, <fn> and <next> assigned, its cost is
 * initialised with init_fcc(), and the global counter of distinct skips
 * is incremented.
 */
static __inline__
Skipped* new_skipped(BBCC* from, fn_node* fn, Skipped* next)
{
    Skipped* entry;

    entry = (Skipped*)VG_(malloc)(sizeof(Skipped));
    init_fcc(&(entry->fcc));
    entry->next = next;
    entry->fn   = fn;
    entry->from = from;

    distinct_skips++;

    return entry;
}


/* Push call on call stack.
 *
 * Increment the usage count for the function called.
 * A jump from <from> to <to>, with <esp>.
 * If <skip> is true, this is a call to a function to be skipped;
 * for this, we set jcc = 0.
 *
 * For a non-skipped call, push_cxt() must have been called before, as
 * the context saved by it in the current stack entry is asserted here.
 * When the called function becomes active for the first time (depth 1),
 * its dump/zero/toggle-collect actions are triggered.
 */
static void push_call_stack(BBCC* from, BBCC* to, Addr esp, Bool skip)
{
    jCC* jcc;
    Int depth;

    /* Ensure a call stack of size <call_stack_sp>+1.
     * The +1 is needed as push_cxt will store the context at [call_stack_sp]
     */
    if (call_stack_sp+1 == call_stack_size) {
        call_stack_size *= 2;
        call_stack = (call_entry*) VG_(realloc)(call_stack,
                       call_stack_size * sizeof(call_entry));

        CT_DEBUGIF(2)
            VG_(printf)("        call stack enlarged to %d entries\n",
                        call_stack_size);
    }

    if (skip) {
        jcc = 0;
    }
    else {
        fn_node* to_fn = to->cxt->fn[0];

        if (current_nonskipped) {
            /* this is a jmp from skipped to nonskipped */
            CT_ASSERT(current_nonskipped == from);
        }

        /* As push_cxt() has to be called before push_call_stack if not
         * skipping, the old context should already be saved on the stack */
        CT_ASSERT(call_stack[call_stack_sp].cxt != 0);
        copy_fcc( &(call_stack[call_stack_sp].fcc), current_fcc );

        jcc = get_jcc(from, to);
        CT_ASSERT(jcc != 0);

#if 0
        /* not really needed, as call_counter isn't used */
        fn_info* toInfo;
        toInfo = get_fn_info(to_fn);
        toInfo->call_counter++;
#endif

        CT_ASSERT(to_fn->number < fn_active_array_size);
        if (clo_skip_direct_recursion) {
            /* only increment depth if another function is called */
            if (jcc->from->cxt->fn[0] != to_fn)
                fn_active_array[to_fn->number]++;
        }
        else fn_active_array[to_fn->number]++;
        depth = fn_active_array[to_fn->number];
        if (depth>1)
            rec_call_counter++;
        
        jcc->call_counter++;
        call_counter++;

        if (depth == 1) {

#if JCC_DEBUG
            if (to_fn->verbosity >=0) {
                Int old = clo_ct_verbose;
                clo_ct_verbose = to_fn->verbosity;
                to_fn->verbosity = old;
                VG_(message)(Vg_DebugMsg, 
                             "Entering %s: Verbosity set to %d",
                             to_fn->name, clo_ct_verbose);
            }
#endif          
            
            if (to_fn->dump_before) {
                /* Room for the option prefix in addition to the function
                 * name; a buffer of only FN_NAME_LEN bytes overflows for
                 * long names (assumes names are shorter than FN_NAME_LEN
                 * -- TODO confirm). */
                Char trigger[FN_NAME_LEN + sizeof("--dump-before=")];
                VG_(sprintf)(trigger, "--dump-before=%s", 
                             to_fn->name);
                cachesim_flush(trigger, True);
            }
            else if (to_fn->zero_before) {
                zero_ccs(True);
            }
            if (to_fn->toggle_collect) {
                collect_state = !collect_state;
                CT_DEBUG(2,"   entering %s: toggled collection state to %s\n",
                         to_fn->name,
                         collect_state ? "ON" : "OFF");
            }   
        }
    }

    /* put jcc on call stack */
    call_stack[call_stack_sp].jcc = jcc;
    call_stack[call_stack_sp].esp = esp;
    call_stack[call_stack_sp].nonskipped = current_nonskipped;

    call_stack_sp++;

    /* To allow for above assertion we set context of next frame to 0 */
    CT_ASSERT(call_stack_sp < call_stack_size);
    call_stack[call_stack_sp].cxt = 0;

    if (!skip)
        current_nonskipped = 0;
    else if (!current_nonskipped) {
        /* a call from nonskipped to skipped */
        current_nonskipped = from;
        if (!current_nonskipped->skipped)
            current_nonskipped->skipped = new_skipped(from, to->bb->fn, 0);
    }

#if JCC_DEBUG
    CT_DEBUGIF(1) {
        if (clo_ct_verbose<4) {
            VG_(printf)("+ %2d ", call_stack_sp);
            print_short_jcc(jcc);
            VG_(printf)(", ESP %x\n", esp);
        }
        else {
            VG_(printf)("Pushed ");
            print_stackentry(7, call_stack_sp-1);
            VG_(printf)("       calling ");
            print_addr_ln(to->bb->addr);
        }
    }
#endif

}


/* Pop the top entry from the call stack and update cumulative sums.
 *
 * For a non-skipped call (jcc != 0), the active count of the called
 * function is decremented, the cost difference since the call is added
 * to the sum of the JCC, and the context and function stack as saved at
 * call time are restored. When the called function gets inactive
 * (depth 0), its dump-after/toggle-collect actions are triggered.
 *
 * The caller has to make sure that the call stack is not empty.
 */
static void pop_call_stack()
{
    jCC* jcc;
    Int depth = 0;

    CT_DEBUG(4,"+pop_call_stack: frame %d, jcc 0x%p\n", 
                call_stack_sp, call_stack[call_stack_sp-1].jcc);

    /* jCC item not any more on real stack: pop */
    jcc =                call_stack[call_stack_sp-1].jcc;
    current_nonskipped = call_stack[call_stack_sp-1].nonskipped;

    if (jcc) {
        fn_node* to_fn  = jcc->to->cxt->fn[0];

        CT_ASSERT(to_fn->number < fn_active_array_size);
        if (clo_skip_direct_recursion) {
            /* only decrement depth if another function was called */
            if (jcc->from->cxt->fn[0] != to_fn)
                fn_active_array[to_fn->number]--;
        }
        else fn_active_array[to_fn->number]--;
        depth = fn_active_array[to_fn->number];

        /* add cost difference to sum */
        if ( add_diff_fcc( &(jcc->sum),
                           &(call_stack[call_stack_sp-1].fcc), current_fcc) ) {
            
            /* only count this call if it attributed some cost.
             * the ret_counter is used to check if a BBCC dump is needed.
             */
            jcc->from->ret_counter++;
        }
        ret_counter++;

        /* restore context */
        current_cxt  = call_stack[call_stack_sp-1].cxt;
        fn_stack_top = fn_stack + call_stack[call_stack_sp-1].fn_sp;
        CT_ASSERT(current_cxt != 0);

        if (depth == 0) {
            if (to_fn->dump_after) {
                /* Room for the option prefix in addition to the function
                 * name; a buffer of only FN_NAME_LEN bytes overflows for
                 * long names (assumes names are shorter than FN_NAME_LEN
                 * -- TODO confirm). */
                Char trigger[FN_NAME_LEN + sizeof("--dump-after=")];
                VG_(sprintf)(trigger, "--dump-after=%s", 
                             to_fn->name);
                cachesim_flush(trigger, True);
            }
            if (to_fn->toggle_collect) {
                collect_state = !collect_state;
                CT_DEBUG(2,"   leaving %s: toggled collection state to %s\n",
                         to_fn->name,
                         collect_state ? "ON" : "OFF");
            }

#if JCC_DEBUG
            if (to_fn->verbosity >=0) {
                Int old = clo_ct_verbose;
                clo_ct_verbose = to_fn->verbosity;
                to_fn->verbosity = old;
                VG_(message)(Vg_DebugMsg, 
                             "Leaving %s: Verbosity set back to %d",
                             to_fn->name, clo_ct_verbose);
            }
#endif          
        }
    }

    call_stack_sp--;

    /* To allow for an assertion in push_call_stack() */
    call_stack[call_stack_sp].cxt = 0;

#if JCC_DEBUG
    CT_DEBUGIF(1) {
        if (clo_ct_verbose<4) {
            if (jcc) {
                /* popped JCC target first */
                VG_(printf)("- %2d %x => ", 
                            call_stack_sp, jcc->to->bb->addr);
                print_addr(jcc->from->bb->jmp_addr);
                VG_(printf)(", ESP %x [%llu/%llu,%llu,%llu]\n",
                            call_stack[call_stack_sp].esp,
                            jcc->call_counter,
                            jcc->sum.Ir.a,
                            jcc->sum.Dr.a,
                            jcc->sum.Dw.a);
            }
            else
                VG_(printf)("- %2d [Skipped JCC], ESP %x\n",
                            call_stack_sp, call_stack[call_stack_sp].esp);
        }
        else {
            VG_(printf)("Popped ");
            print_stackentry(7, call_stack_sp);
            if (jcc) {
                VG_(printf)("       returned to ");
                print_addr_ln(jcc->from->bb->jmp_addr);
            }
        }
    }
#endif

}


/* Remove call stack entries to get in sync with the current ESP.
 *
 * Entries are popped as long as the stack pointer stored for the top
 * entry is lower than <esp>:
 * For a call, let p be the stack address holding the return address.
 *  - the entry stores the ESP after the CALL: p-4
 *  - the current esp after a RET is >= p
 *
 * Returns the number of entries popped.
 */
static Int unwind_call_stack(Addr esp)
{
    Int popped;
    Int sp_before = call_stack_sp;

    CT_DEBUG(4,"+unwind_call_stack: esp 0x%x, frame %d\n", 
                esp, call_stack_sp);

    for (;;) {
        if (call_stack_sp <= 0) break;
        if (call_stack[call_stack_sp-1].esp >= esp) break;

        pop_call_stack();
    }

    popped = sp_before - call_stack_sp;

    CT_DEBUG(4,"-unwind_call_stack: diff %d\n", 
                sp_before - call_stack_sp);

    return popped;
}


/*------------------------------------------------------------*/
/*--- BBCC operations                                      ---*/
/*------------------------------------------------------------*/

/* All BBCCs are inserted into 2 hash tables.
 * These hash tables are thread specific.
 *  - hash(addr/context => BBCC),
 *  - hash(object name =>
 *         hash(filename => 
 *              hash(fn_name => 
 *                   hash(addr => BBCC))))
 *
 * The first is for fast lookups and needs to be resizable,
 * as we don't know the number of BBs in advance and don't want
 * to waste memory.
 * The second hash is for ordered dumping of CCs at the end.
 * When dumping, the hash keys of the second hash are used.
 *
 */



/* Hash function for the BBCC hash table. BBCCs are stored according to
 * - start address,
 * - current context (involves caller chain with current function)
 *
 * The bucket is the sum of the address and the context pointer value,
 * modulo the table size <size>.
 *
 * Only BBCCs for recursion level 1 are stored in the table
 * (other BBCCs are reachable via rec_array pointer)
 */
static
UInt hash_bbcc(Addr addr, Context* cxt, UInt size)
{
   Addr key;

   CT_ASSERT(addr != 0);
   CT_ASSERT(cxt != 0);

   key = addr + (Addr)cxt;
   return key % size;
}
 

/* Look up the BBCC for <bb> in context <cxt> in hash1.
 *
 * The BBCC used last for this BB (bb->last_bbcc) is checked first; it is
 * returned if its context matches and either threads are not dumped
 * separately or its thread ID is the current one.
 * Otherwise the hash chain is searched; 0 is returned if nothing matches.
 *
 * If prev != 0 and the hash chain was searched, *prev is set to the
 * address of the pointer linking to the returned position in the chain.
 * This is needed for possible deletion. Note that *prev is not written
 * when the last-used BBCC matches.
 */ 
static __inline__
BBCC* lookup_bbcc(BB* bb, BBCC*** prev, Context* cxt)
{
   BBCC* bbcc = bb->last_bbcc;
   UInt  slot;

   /* check LRU; if we don't dump threads separate, tid doesn't have
    * to match */
   if ((bbcc->cxt == cxt) &&
       (!clo_dump_threads || (bbcc->tid == current_tid)))
       return bbcc;

   bbcc_lru_misses++;

   slot = hash_bbcc(bb->addr, cxt, bbcc_table_size);
   if (prev) *prev = &(bbcc_table[slot]);

   for (bbcc = bbcc_table[slot]; bbcc != 0; bbcc = bbcc->next1) {
       if ((bbcc->bb == bb) && (bbcc->cxt == cxt)) break;
       if (prev) *prev = &(bbcc->next1);
   }
   
   CT_DEBUG(2,"  lookup_bbcc(BB 0x%x, Cxt %d, fn '%s'): 0x%p (tid %d)\n",
               bb->addr, cxt->base_number, cxt->fn[0]->name, 
               bbcc, bbcc ? bbcc->tid : 0);

   CT_DEBUGIF(2)
       if (bbcc) print_bbcc(-2,bbcc);

   return bbcc;
}


/* double size of hash table 1 (addr->BBCC) */
static void resize_bbcc_hash()
{
    Int i, new_size, conflicts1 = 0, conflicts2 = 0;
    BBCC** new_table;
    UInt new_hash;
    BBCC *curr_BBCC, *next_BBCC;

    new_size = 2*bbcc_table_size+3;
    new_table = (BBCC**) VG_(malloc)(new_size * sizeof(BBCC*));
 
    if (!new_table) return;
 
    for (i = 0; i < new_size; i++)
      new_table[i] = NULL;
 
    for (i = 0; i < bbcc_table_size; i++) {
	if (bbcc_table[i] == NULL) continue;
 
	curr_BBCC = bbcc_table[i];
	while (NULL != curr_BBCC) {
	    next_BBCC = curr_BBCC->next1;

	    new_hash = hash_bbcc(curr_BBCC->bb->addr,
				 curr_BBCC->cxt,
				 new_size);

	    curr_BBCC->next1 = new_table[new_hash];
	    new_table[new_hash] = curr_BBCC;
	    if (curr_BBCC->next1) {
		conflicts1++;
		if (curr_BBCC->next1->next1)
		    conflicts2++;
	    }

	    curr_BBCC = next_BBCC;
	}
    }

    VG_(free)(bbcc_table);


    CT_DEBUG(0,"Resize BBCC Hash: %d => %d (entries %d, conflicts %d/%d)\n",
	     bbcc_table_size, new_size,
	     bbcc_table_entries, conflicts1, conflicts2);

    bbcc_table_size = new_size;
    bbcc_table = new_table;
    bbcc_table_resizes++;
}



/* Forward declaration. */
static Int compute_BBCC_array_size(UCodeBlock* cb);
 

/* TODO: allocate together with BBCC */

/* Allocate an array of <size> BBCC pointers, all set to 0.
 *
 * The array is used as rec_array of a BBCC, with one slot per recursion
 * level. A <size> lower than 1 is raised to 1, so that the array always
 * provides slot 0, which callers (e.g. clone_bbcc()) write
 * unconditionally. This mirrors the clamping of fn_recursion in new_cxt().
 */
static __inline
BBCC** new_recursion(int size)
{
    BBCC** bbccs;
    int i;

    if (size < 1) size = 1;
    
    bbccs = (BBCC**) VG_(malloc)(sizeof(BBCC*) * size);
    for(i=0;i<size;i++)
        bbccs[i] = 0;

    CT_DEBUG(3,"  new_recursion(size %d): 0x%p\n", size, bbccs);
    
    return bbccs;
}
  

/*
 * Allocate a new BBCC for <bb>, owned by the current thread.
 *
 * If <allocate_cost_array> is set, bb->array_size additional bytes are
 * allocated directly behind the BBCC struct and <array> points to them;
 * otherwise <array> is 0. Counters, the skipped record, the JCC list and
 * the LRU pointer caches are reset.
 *
 * Left uninitialized:
 * cxt, rec_index, rec_array, next_bbcc, next1, next2
 */
static __inline__ 
BBCC* new_bbcc(BB* bb, Bool allocate_cost_array)
{
   BBCC* bbcc;
   Int bytes;

   bytes = sizeof(BBCC);
   if (allocate_cost_array) bytes += bb->array_size;

   bbcc = (BBCC*)VG_(malloc)(bytes);

   bbcc->tid = current_tid;
   bbcc->bb  = bb;

   /* cost array, if requested, lives directly behind the BBCC struct */
   if (!allocate_cost_array)
       bbcc->array = 0;
   else
       bbcc->array = (Addr)(bbcc + 1);

   bbcc->skipped = 0;
   bbcc->exe_counter = 0;
   bbcc->ret_counter = 0;
   bbcc->jcc_list  = 0;

   /* Init pointer caches (LRU) */
   bbcc->lru_to_jcc  = 0;
   bbcc->lru_from_jcc  = 0;
   bbcc->lru_next_bbcc = 0;

   CT_DEBUG(3, "  new_bbcc(BB 0x%x, size %d): 0x%p\n", 
               bb->addr, bb->array_size, bbcc);

   distinct_bbccs++;

   return bbcc;
}


/**
 * Inserts a new BBCC into both hashes.
 *
 * The BB and the context of the BBCC must be set, as they are used for
 * the hash keys:
 *  - hash1 (global table): BB address and context, see hash_bbcc()
 *  - hash2 (per function info): BB address modulo N_BBCC2_ENTRIES
 *
 * Recursion level doesn't need to be set as this is not included
 * in the hash key: Only BBCCs with rec level 0 are in hashes.
 */
static
void insert_bbcc_into_hash(BBCC* bbcc)
{
    UInt slot1, slot2;
    fn_info* info;
    
    CT_ASSERT(bbcc->cxt != 0);

    CT_DEBUG(3,"+ insert_bbcc_into_hash(BB 0x%x, fn '%s')\n",
                bbcc->bb->addr, bbcc->cxt->fn[0]->name);

    /* check fill degree of hash1 and resize if needed (>90%) */
    bbcc_table_entries++;
    if (100 * bbcc_table_entries / bbcc_table_size > 90)
        resize_bbcc_hash();

    /* slot in hash1 has to be computed after a possible resize */
    slot1 = hash_bbcc(bbcc->bb->addr, bbcc->cxt, bbcc_table_size);
    info  = get_fn_info(bbcc->cxt->fn[0]);
    slot2 = bbcc->bb->addr % N_BBCC2_ENTRIES;

    /* link at head of chain in hash1 */
    bbcc->next1 = bbcc_table[slot1];
    bbcc_table[slot1] = bbcc;

    /* link at head of chain in hash2 of the function */
    bbcc->next2 = info->bbccs[slot2];
    info->bbccs[slot2] = bbcc;

    CT_DEBUG(3,"- insert_bbcc_into_hash: %d entries\n", bbcc_table_entries);
}

 


/* Create a new BBCC as a copy of an existing one,
 * but with costs set to 0 and jcc chains empty.
 *
 * This is needed when a BB is executed in another context than
 * the one at instrumentation time of the BB.
 *
 * Use cases:
 *  rec_index == 0: clone from a BBCC with differing tid/cxt
 *                  and insert into hashes
 *  rec_index >0  : clone from a BBCC with same tid/cxt and rec_index 0
 *                  don't insert into hashes
 *
 * The clone is put at the head of the BBCC list of the BB.
 * With cache simulation, the clone gets its own cost array: the static
 * parts of each cost center (tag, sizes, instruction address) are copied
 * from <orig> and the costs are zeroed. Without cache simulation, the
 * clone shares the array pointer of <orig>.
 */
static BBCC* clone_bbcc(BBCC* orig, Context* cxt, Int rec_index)
{
    BBCC*      new;
    Int        array_size;
    UChar      tag;
    Addr       orig_ptr0, orig_ptr, new_ptr;

    CT_DEBUG(3,"+ clone_bbcc(BB 0x%x, rec %d, fn %s)\n",
                orig->bb->addr, rec_index, cxt->fn[0]->name);

    new  = new_bbcc(orig->bb, clo_simulate_cache);

    if (rec_index == 0) {

        /* hash insertion is only allowed if tid or cxt is different */
        CT_ASSERT((orig->tid != current_tid) ||
                  (orig->cxt != cxt));

        new->rec_index = 0;
        new->cxt = cxt;
        new->rec_array = new_recursion(cxt->fn[0]->fn_recursion);
        new->rec_array[0] = new;

        insert_bbcc_into_hash(new);
    }
    else {
        if (clo_dump_threads)
            CT_ASSERT(orig->tid == current_tid);

        CT_ASSERT(orig->cxt == cxt);
        CT_ASSERT(orig->rec_array);
        CT_ASSERT(cxt->fn[0]->fn_recursion > rec_index);
        CT_ASSERT(orig->rec_array[rec_index] ==0);

        /* new BBCC will only have differing recursion level */
        new->rec_index = rec_index;
        new->cxt = cxt;
        new->rec_array = orig->rec_array;
        new->rec_array[rec_index] = new;
    }

    /* update list of BBCCs for same BB */
    new->next_bbcc = orig->bb->bbcc_list;
    orig->bb->bbcc_list = new;


    CT_DEBUGIF(3)
        print_bbcc(-2, new);

    if (!clo_simulate_cache) {
        sk_assert(new->array == 0);
        new->array = orig->array;
    }
    else {
        /* Copy CCs and zero out costs */   
        array_size = orig->bb->array_size;
        
        orig_ptr  = (Addr)(orig->array);
        new_ptr   = (Addr)(new->array);
        orig_ptr0 = orig_ptr;
        /* both arrays are walked in parallel; the tag of each cost
         * center determines its type and thus the step width */
        while (orig_ptr - orig_ptr0 < array_size) {
            
            tag = ((iCC*)orig_ptr)->tag;
            ((iCC*)new_ptr)->tag = tag;
            
            switch ( tag ) {
            case InstrCC:
                {
                    iCC* orig_cc = ((iCC*)orig_ptr);
                    iCC* new_cc  = ((iCC*)new_ptr);
                    
                    new_cc->instr_size = orig_cc->instr_size;
                    new_cc->instr_addr = orig_cc->instr_addr;
                    
                    init_cc( &(new_cc->I) );
                }
                orig_ptr += sizeof(iCC);
                new_ptr += sizeof(iCC);
                break;
                
            case ReadCC:
            case WriteCC:
            case ModCC:
                {
                    idCC* orig_cc = ((idCC*)orig_ptr);
                    idCC* new_cc  = ((idCC*)new_ptr);
                    
                    new_cc->instr_size = orig_cc->instr_size;
                    new_cc->data_size  = orig_cc->data_size;
                    new_cc->instr_addr = orig_cc->instr_addr;
                    
                    init_cc( &(new_cc->I) );
                    init_cc( &(new_cc->D) );
                }
                orig_ptr += sizeof(idCC);
                new_ptr += sizeof(idCC);
                break;
                
            case ReadWriteCC:
                {
                    iddCC* orig_cc = ((iddCC*)orig_ptr);
                    iddCC* new_cc  = ((iddCC*)new_ptr);
                    
                    new_cc->instr_size = orig_cc->instr_size;
                    new_cc->data_size  = orig_cc->data_size;
                    new_cc->instr_addr = orig_cc->instr_addr;
                    
                    init_cc( &(new_cc->I) );
                    init_cc( &(new_cc->Da) );
                    init_cc( &(new_cc->Db) );
                }
                orig_ptr += sizeof(iddCC);
                new_ptr += sizeof(iddCC);
                break;
                
            default:
                VG_(skin_panic)("Unknown CC type in clone_BBCC()\n");
                break;
            }
        }
    }
 
    CT_DEBUG(2,"- clone_BBCC(0x%p, %d) for BB 0x%x\n"
                "   orig %s\n"
                "   new  %s\n",
                orig, rec_index, orig->bb->addr,
                mangled_cxt(orig->cxt, orig->rec_index),
                mangled_cxt(new->cxt, new->rec_index));

    bbcc_clones++;
 
    return new;
}



/* Get a pointer to the cost centre structure for the basic block with
 * the given start address.
 *
 * If the BB already has BBCCs, the first one of its list is returned
 * and *BB_seen_before is set to True; unless <remove> is true, this is
 * counted as a retranslation. For <remove>, nothing is changed here:
 * the BB can't be freed at this point as jccs can have references to
 * it, and resetting its address would break hash resizing.
 *
 * Otherwise, *BB_seen_before is set to False and a new BBCC with a cost
 * array is allocated (array size computed from <cb>, or 0 for artifical
 * BBs without translation). The new BBCC has no context yet; it becomes
 * the head of the BBCC list of the BB and its last used BBCC.
 */ 
static
BBCC* get_bbcc(Addr bb_orig_addr, UCodeBlock* cb, 
               Bool remove, Bool *BB_seen_before)
{
   BB*   bb;
   BBCC* bbcc;

   VGP_PUSHCC(VgpCacheGetBBCC);

   CT_DEBUG(3, "+ get_bbcc(BB 0x%x)\n", bb_orig_addr);

   /* TODO: optimisation: allocate BB+BBCC+BBRecursion at once */

   bb = get_bb(bb_orig_addr);
   bbcc = bb->bbcc_list;

   if (!bbcc) {
       /* first time this BB is seen */
       CT_ASSERT(False == remove);
       *BB_seen_before = False;

       bb->array_size = cb ? compute_BBCC_array_size(cb) : 0;       
       bbcc = new_bbcc(bb, True);
       sk_assert(bbcc->array != 0);

       /* initialize BBCC */
       bbcc->rec_index = 0;
       bbcc->rec_array = 0;
       bbcc->cxt       = 0;

       bbcc->next_bbcc = bb->bbcc_list;
       bb->bbcc_list = bbcc;
       bb->last_bbcc = bbcc;

       CT_DEBUGIF(3)
           print_bbcc(-2, bbcc);
   }
   else {
      CT_ASSERT(bb_orig_addr == bb->addr);
      // array_size is 0 for artifical BBs (without translation)
      CT_ASSERT(bb->array_size >= 0 && bb->array_size < 1000000);
      if (VG_(clo_verbosity) > 2) {
          VG_(message)(Vg_DebugMsg, 
            "BB retranslation, retrieving from BB table");
      }
      *BB_seen_before = True;

      /* FIXME: Use refcounts for jccs pointing to the BB, to allow
       * for freeing it on removal */
      if (True != remove)
          BB_retranslations++;
   }

   CT_DEBUG(3, "- get_bbcc(BB 0x%x): BBCC 0x%p\n",
                bb_orig_addr, bbcc);

   VGP_POPCC(VgpCacheGetBBCC);
   return bbcc;
}




/*------------------------------------------------------------*/
/*--- Support for signal handlers and multi-threading      ---*/
/*------------------------------------------------------------*/


/*
 * For Valgrind, MT is cooperative (no preemption in our code),
 * so we don't need locks...
 *
 * Per-thread data:
 *  - BBCCs
 *  - BBCC hash2
 *  - call stack
 *  - call hash
 *  - event counters: last, current, discards
 *
 * Commonly used data:
 *  - BBCC hash1 (entries are keyed by addr + tid)
 *
 * Even when not supporting MT, we need these functions to set up some
 * data structures for the process (= Thread 1).
 */


/* Per-thread state, indexed by thread ID. An entry is 0 as long as
 * switch_thread() has not created the thread info for this ID. */
static thread_info* thread[VG_N_THREADS];

/* Reset the thread table: no thread info exists, and there is no
 * current thread. */
static void init_threads()
{
    Int tid;

    current_tid = VG_INVALID_THREADID;
    for (tid = 0; tid < VG_N_THREADS; tid++)
        thread[tid] = 0;
}

/* Each thread can be interrupted by a signal handler, and signal
 * handlers can themselves be interrupted again. But as there's no
 * scheduling among handlers of the same thread, we don't need
 * additional stacks.
 * So storing execution contexts and adding separators in the call stack
 * (needed to not intermix normal/handler functions in contexts) should
 * be enough.
 */

/* Allocate a context info structure for signal number <sigNum>.
 *
 * The collect state starts with the value of clo_collect_state, the
 * cost of <current> is initialised with init_fcc(), the jump kind is
 * set to -1, and all other state (context, BBCC, nonskipped BBCC,
 * call stack bottom) is reset to 0.
 */
static thread_cxtinfo* new_cxtinfo(Int sigNum)
{
    thread_cxtinfo* info;

    info = (thread_cxtinfo*) VG_(malloc)(sizeof(thread_cxtinfo));

    info->sigNum        = sigNum;
    info->collect_state = clo_collect_state;
    info->bbcc_jmpkind  = -1;

    info->call_stack_bottom  = 0;
    info->current_nonskipped = 0;
    info->current_bbcc       = 0;
    info->current_cxt        = 0;

    init_fcc( &(info->current) );

    return info;
}

/* Get top context info struct of current thread */
static thread_cxtinfo* top_cxtinfo()
{
  Int cxt_sp = thread[current_tid]->cxt_stackpointer;
  thread_cxtinfo* ci;

  CT_ASSERT((cxt_sp >= 0) && (cxt_sp < MAX_SIGHANDLERS));
  ci = thread[current_tid]->cxt_stack[cxt_sp];
  CT_ASSERT(ci != 0);
  return ci;
}

/* Allocates a free context info structure for a new entered
 * signal handler, putting it on the context stack.
 * Returns a pointer to the structure.
 */
static thread_cxtinfo* push_cxtinfo(int sigNum)
{   
  Int cxt_sp;
  thread_cxtinfo* ci;

  thread[current_tid]->cxt_stackpointer ++;
  cxt_sp = thread[current_tid]->cxt_stackpointer;

  CT_ASSERT((sigNum > 0) && (sigNum <= VKI_KNSIG));
  CT_ASSERT((cxt_sp > 0) && (cxt_sp < MAX_SIGHANDLERS));
  ci = thread[current_tid]->cxt_stack[cxt_sp];
  if (!ci) {
    ci = new_cxtinfo(sigNum);
    thread[current_tid]->cxt_stack[cxt_sp] = ci;
  }
  else
    ci->sigNum = sigNum;

  return ci;
}

/* Allocate and initialise the state of a new thread:
 *  - context info stack (slot 0 is created at once, with signal number 0)
 *  - empty call stack and function stack
 *  - zeroed event counters
 *  - zeroed function active array and function info table, both sized
 *    to hold at least all functions known so far
 *  - empty JCC and BBCC hash tables
 */
static thread_info* new_thread()
{
    Int n;
    thread_info* ti;

    ti = VG_(malloc)(sizeof(thread_info));

    /* additional stack of contexts for signal handlers.
     * The first element is for the main thread */
    ti->cxt_stack[0] = new_cxtinfo(0);
    n = 1;
    while (n < MAX_SIGHANDLERS) {
      ti->cxt_stack[n] = 0;
      n++;
    }
    ti->cxt_stackpointer = 0;
    init_fcc( &(ti->handler_sum) );

    /* call stack */
    ti->stack_size = N_STACK_INITIAL_ENTRIES;   
    ti->stack = (call_entry*) VG_(malloc)(ti->stack_size * sizeof(call_entry));
    ti->stack_sp = 0;
    ti->stack[0].cxt = 0; /* for assertion in push_cxt() */

    /* function context stack; the bottom entry is always 0 */
    ti->fn_stack_size = N_FNSTACK_INITIAL_ENTRIES;   
    ti->fn_stack = (fn_node**) VG_(malloc)(ti->fn_stack_size * sizeof(fn_node*));
    ti->fn_stack_top = ti->fn_stack;
    ti->fn_stack[0] = 0;
    ti->cxt_stack[0]->current_cxt = get_cxt(ti->fn_stack);

    /* event counters */
    init_fcc( &(ti->last) );
    init_fcc( &(ti->discards) );

    /* function active counter array */
    ti->fn_active_array_size = N_FNINFO_INITIAL_ENTRIES;
    if (ti->fn_active_array_size <= distinct_fns)
        ti->fn_active_array_size = distinct_fns+1;
    ti->fn_active_array = (UInt*) VG_(malloc)(ti->fn_active_array_size *
                                               sizeof(UInt));
    for (n = 0; n < ti->fn_active_array_size; n++)
        ti->fn_active_array[n] = 0;

    /* function info table */
    ti->fn_info_table_size = N_FNINFO_INITIAL_ENTRIES;
    if (ti->fn_info_table_size <= distinct_fns)
        ti->fn_info_table_size = distinct_fns+1;
    ti->fn_info_table = (fn_info**) VG_(malloc)(ti->fn_info_table_size *
                                               sizeof(fn_info*));
    for (n = 0; n < ti->fn_info_table_size; n++)
        ti->fn_info_table[n] = 0;

    /* jcc hash */
    ti->jcc_table_size    = N_JCC_INITIAL_ENTRIES;
    ti->jcc_table_entries = 0;
    ti->jcc_spontaneous   = 0;
    ti->jcc_table = (jCC**) VG_(malloc)(ti->jcc_table_size * sizeof(jCC*));
    for (n = 0; n < ti->jcc_table_size; n++)
        ti->jcc_table[n] = 0;
    
    /* bbcc hash */
    ti->bbcc_table_size    = N_BBCC1_INITIAL_ENTRIES;
    ti->bbcc_table_entries = 0;
    ti->bbcc_table = (BBCC**) VG_(malloc)(ti->bbcc_table_size * sizeof(BBCC*));
    for (n = 0; n < ti->bbcc_table_size; n++)
        ti->bbcc_table[n] = 0;
    
    return ti;
}

/* Save current context to top cxtinfo struct */
static thread_cxtinfo* cxtinfo_save()
{
  thread_cxtinfo* ci = top_cxtinfo();

  CT_DEBUGIF(1) {
    CT_DEBUG(1, "  cxtinfo_save(sig %d): collect %s, jmpKind %d\n",
	     ci->sigNum, collect_state ? "Yes": "No", bbcc_jmpkind);	
	print_bbcc(-9, current_bbcc);
	print_fcc(-9, current_fcc);	
    }

  ci->current_cxt = current_cxt;
  ci->collect_state = collect_state;
  ci->bbcc_jmpkind = bbcc_jmpkind;
  ci->current_bbcc = current_bbcc;
  ci->current_nonskipped = current_nonskipped;

  /* signal number does not need to be saved */
  CT_ASSERT(current_sigNum == ci->sigNum);

  return ci;
}

/* Load the current execution state from the top context info struct of
 * the current thread: context, collect state, jump kind, current BBCC,
 * nonskipped BBCC and signal number. The current cost pointer is set to
 * the cost stored inside the struct. Returns that struct. */
static thread_cxtinfo* cxtinfo_restore()
{
  thread_cxtinfo* top = top_cxtinfo();

  current_sigNum     = top->sigNum;
  current_fcc        = &(top->current);
  current_nonskipped = top->current_nonskipped;
  current_bbcc       = top->current_bbcc;
  bbcc_jmpkind       = top->bbcc_jmpkind;
  collect_state      = top->collect_state;
  current_cxt        = top->current_cxt;
    
  CT_DEBUGIF(1) {
        CT_DEBUG(1, "  cxtinfo_restore(sig %d): collect %s, jmpKind %d\n",
                 top->sigNum, collect_state ? "Yes": "No", bbcc_jmpkind);
        print_bbcc(-9, current_bbcc);
        print_fcc(-9, current_fcc);
  }

  return top;
}


/* Make <tid> the current thread.
 *
 * Nothing is done if <tid> already is the current thread. Otherwise, the
 * global working state (call stack, function stack, execution context,
 * function active array, and the function info/JCC/BBCC tables) is
 * written back to the thread info of the old thread, and then loaded
 * from the thread info of <tid>, which is created on first use.
 *
 * If threads are not dumped separately (clo_dump_threads is off), the
 * function info table and the JCC/BBCC hash tables are always stored in
 * and loaded from the thread info of thread 1.
 * NOTE(review): this dereferences thread[1] without a check, so it looks
 * like thread 1 has to be the first thread switched to -- confirm.
 */
static void switch_thread(ThreadId tid)
{
    if (tid == current_tid) return;

    CT_DEBUG(1, "switch_thread: %d -> %d\n", current_tid, tid);

    if (current_tid != VG_INVALID_THREADID) {    
	/* save thread state */

	/* If we cumulate costs of threads, we use TID 1 for some structures */
	int shared_tid = clo_dump_threads ? current_tid : 1;

	CT_ASSERT(thread[current_tid] != 0);

	/* call_stack */
	thread[current_tid]->stack_size = call_stack_size;
	thread[current_tid]->stack      = call_stack;
	thread[current_tid]->stack_sp   = call_stack_sp;
	
	/* context stack */
	thread[current_tid]->fn_stack_size = fn_stack_size;
	thread[current_tid]->fn_stack      = fn_stack;
	thread[current_tid]->fn_stack_top  = fn_stack_top;

	/* current context (including signal handler contexts) */
	cxtinfo_save();

	/* tables/arrays */
	thread[current_tid]->fn_active_array = fn_active_array;
	thread[current_tid]->fn_active_array_size = fn_active_array_size;

	thread[shared_tid]->fn_info_table = fn_info_table;
	thread[shared_tid]->fn_info_table_size = fn_info_table_size;

	thread[shared_tid]->jcc_table = jcc_table;
	thread[shared_tid]->jcc_table_size = jcc_table_size;
	thread[shared_tid]->jcc_table_entries = jcc_table_entries;
	thread[shared_tid]->jcc_spontaneous = jcc_spontaneous;

	thread[shared_tid]->bbcc_table_size = bbcc_table_size;
	thread[shared_tid]->bbcc_table_entries = bbcc_table_entries;
	thread[shared_tid]->bbcc_table = bbcc_table;

	/* to detect bugs... */
	current_fcc  = 0;
	discards_fcc = 0;
	call_stack   = 0;
	fn_stack     = 0;
	fn_info_table = 0;
	jcc_table    = 0;
	bbcc_table   = 0;
    }

    /* current_tid has to be set before cxtinfo_restore() below, as the
     * top context info is looked up via thread[current_tid] */
    current_tid = tid;
    CT_ASSERT(tid < VG_N_THREADS);

    if (tid != VG_INVALID_THREADID) {

	/* If we cumulate costs of threads, we use TID 1 for some structures */
	int shared_tid = clo_dump_threads ? current_tid : 1;

	/* load thread state */

	if (thread[tid] == 0) thread[tid] = new_thread();

	/* current context (including signal handler contexts) */
	cxtinfo_restore();

	call_stack_size  = thread[tid]->stack_size;
	call_stack       = thread[tid]->stack;
	call_stack_sp    = thread[tid]->stack_sp;

	fn_stack_size  = thread[tid]->fn_stack_size;
	fn_stack       = thread[tid]->fn_stack;
	fn_stack_top   = thread[tid]->fn_stack_top;

	discards_fcc      = &(thread[tid]->discards);

	/* functions may have been added while this thread was not active:
	 * enlarge the array if it can't hold all known functions */
	fn_active_array      = thread[tid]->fn_active_array;
	fn_active_array_size = thread[tid]->fn_active_array_size;
	if (fn_active_array_size <= distinct_fns)
	    resize_fn_active_array();

	/* BBCC/JCC tables */
	fn_info_table      = thread[shared_tid]->fn_info_table;
	fn_info_table_size = thread[shared_tid]->fn_info_table_size;
	if (fn_info_table_size <= distinct_fns)
	    resize_fn_info_table();

	bbcc_table_size    = thread[shared_tid]->bbcc_table_size;
	bbcc_table_entries = thread[shared_tid]->bbcc_table_entries;
	bbcc_table         = thread[shared_tid]->bbcc_table;

	jcc_table          = thread[shared_tid]->jcc_table;
	jcc_table_size     = thread[shared_tid]->jcc_table_size;
	jcc_table_entries  = thread[shared_tid]->jcc_table_entries;
	jcc_spontaneous    = thread[shared_tid]->jcc_spontaneous;
    }
}

/* Make sure the result file is available and hand back its descriptor.
 *
 *  fd > -2   : returned unchanged (an open descriptor, or the error
 *              value -1 from an earlier attempt).
 *  otherwise : no error so far, but the file still has to be created;
 *              result_file is created/truncated for writing with owner
 *              read/write permission.  Returns the new descriptor, or -1
 *              if the open failed.
 */
static Int createRes(Int fd)
{
    Int opened;

    if (fd >= -1) return fd;

    opened = VG_(open)(result_file,
		       VKI_O_CREAT|VKI_O_WRONLY|VKI_O_TRUNC,
		       VKI_S_IRUSR|VKI_S_IWUSR);

    /* VG_(open) can report failure with any negative number.  Fold all of
     * them into -1 so a failure is never mistaken for the special "create
     * the file" value -2.
     */
    return (opened < 0) ? -1 : opened;
}

/* Temporary output buffer for
 *  print_fn_pos, fprint_apos, fprint_fcost, fprint_jcc,
 *  fprint_fcc_ln, dump_run_info, dump_state_info
 *
 * dump_info() and dump_state() format each output line into this buffer
 * before writing it out.  It is one static buffer of fixed size
 * (FILENAME_LEN + FN_NAME_LEN + OBJ_NAME_LEN characters): every user
 * overwrites the previous contents, and nothing longer than that may be
 * formatted into it.
 */
static Char outbuf[FILENAME_LEN + FN_NAME_LEN + OBJ_NAME_LEN];


/* Run Info: Fixed information for a callgrind run.
 *
 * Writes the "version:", "pid:", "base:" and "cmd:" lines to the result
 * file.  fd is first passed through createRes(), so the special value -2
 * means "create the result file now".
 *
 * Returns the descriptor that was written to, or a negative value if the
 * result file is not available (nothing is written in that case).
 *
 * Note that no newline is written after the last client argument of the
 * "cmd:" line.
 */
static Int dump_info(Int fd)
{
    Char* buf = outbuf;
    int i;
    
    if ( (fd = createRes(fd)) <0) return fd;

    /* version */
    VG_(sprintf)(buf, "version: " VERSION "\n");
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
    
    /* "pid:" line */
    VG_(sprintf)(buf, "pid: %d\n", VG_(getpid)());
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
    
    /* "base:" line */
    VG_(sprintf)(buf, "base: %s\n", dump_file_base);
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
    
    /* "cmd:" line */
    VG_(strcpy)(buf, "cmd:");
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
    for (i = 0; i < VG_(client_argc); i++) {
	/* A client argument can be arbitrarily long, so it must not be
	 * formatted into the fixed-size outbuf.  Write the separating
	 * blank and the argument itself directly instead; the bytes
	 * that end up in the file are the same as with " %s".
	 */
	VG_(write)(fd, (void*)" ", 1);
	VG_(write)(fd, (void*)VG_(client_argv[i]),
		   VG_(strlen)(VG_(client_argv[i])));
    }

    return fd;
}

/* Dump info on current callgrind state.
 *
 * Writes to the result file:
 *  - global counters (executed BBs/calls, distinct BBs/calls/functions/
 *    contexts), the "events:" header and the current part number,
 *  - the list of existing threads and the current thread id,
 *  - for every existing thread: its event counters, the depth of its call
 *    stack and, per non-skipped stack frame, the calling function, the
 *    call count and the events of that call; finally the function the
 *    topmost frame calls into.
 *
 * fd is first passed through createRes(), so -2 means "create the result
 * file now".  Returns the descriptor written to, or a negative value if
 * the result file is not available.
 *
 * The function switches through all threads while dumping and switches
 * back to the thread that was current on entry before returning.
 */
static Int dump_state(Int fd)
{
    Char* buf = outbuf;

    int t, p, i;
    int orig_tid = current_tid;
    fCC sum, tmp;
    BBCC *from, *to;

    if ( (fd = createRes(fd)) <0) return fd;

    VG_(sprintf)(buf, "executed-bbs: %llu\n", bb_executions);
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));

    VG_(sprintf)(buf, "executed-calls: %llu\n", call_counter);
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));

    VG_(sprintf)(buf, "distinct-bbs: %d\n", distinct_bbs);
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));

    VG_(sprintf)(buf, "distinct-calls: %d\n", jcc_table_entries);
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));

    VG_(sprintf)(buf, "distinct-functions: %d\n", distinct_fns);
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));

    VG_(sprintf)(buf, "distinct-contexts: %d\n", distinct_contexts);
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));

    /* "events:" line. Given here because it will be dynamic in the future */
    if (!clo_simulate_cache)
	VG_(sprintf)(buf, "events: Ir\n");
    else
	VG_(sprintf)(buf, "events: Ir Dr Dw I1mr "
		     "D1mr D1mw I2mr D2mr D2mw\n");
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
		
    /* "part:" line (number of last part; is 0 at start) */
    VG_(sprintf)(buf, "\npart: %d\n", out_counter);
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
		
    /* threads: ids of all threads that have a thread structure */
    p = VG_(sprintf)(buf, "threads:");
    for(t=1;t<VG_N_THREADS;t++) {
	if (!thread[t]) continue;
	p += VG_(sprintf)(buf+p, " %d", t);
    }
    p += VG_(sprintf)(buf+p, "\n");
    VG_(write)(fd, (void*)buf, p);

    VG_(sprintf)(buf, "current-tid: %d\n", orig_tid);
    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));

    /* current event counters */
    for(t=1;t<VG_N_THREADS;t++) {
	if (!thread[t]) continue;
	/* make the per-thread globals (call_stack, current_fcc, ...)
	 * refer to thread t */
	switch_thread(t);
	    
	p = VG_(sprintf)(buf, "events-%d: ",t);
	init_fcc( &sum );
	/* thread[t]->last is saved before and restored after the call to
	 * add_diff_fcc(), so the dump leaves it untouched
	 * (NOTE(review): presumably add_diff_fcc() updates its second
	 * argument - confirm against its definition). */
	copy_fcc( &tmp, &(thread[t]->last) );
	add_diff_fcc(&sum,
		     &(thread[t]->last),
		     &(thread[t]->cxt_stack[0]->current));
	copy_fcc( &(thread[t]->last), &tmp );
	p += sprint_fcc(buf + p, &sum);
	p += VG_(sprintf)(buf+p, "\n");
	VG_(write)(fd, (void*)buf, p);

	p = VG_(sprintf)(buf, "frames-%d: %d\n",t, call_stack_sp);
	VG_(write)(fd, (void*)buf, p);
	for(i = 0; i < call_stack_sp; i++) {
	    /* if this frame is skipped, we don't have counters */
	    if (!call_stack[i].jcc) continue;

	    from = call_stack[i].jcc->from;
	    p = VG_(sprintf)(buf, "function-%d-%d: %s\n",t, i, 
			     from->cxt->fn[0]->name);	    
	    VG_(write)(fd, (void*)buf, p);

	    p = VG_(sprintf)(buf, "calls-%d-%d: ",t, i);
	    p+= VG_(sprintf)(buf+p, "%llu\n", 
			     call_stack[i].jcc->call_counter);
	    VG_(write)(fd, (void*)buf, p);

	    /* events of this call: sum stored in the JCC plus what was
	     * collected since; the frame's fcc is saved and restored
	     * around add_diff_fcc() like thread[t]->last above */
	    copy_fcc( &sum, &(call_stack[i].jcc->sum) );
	    copy_fcc( &tmp, &(call_stack[i].fcc) );
	    add_diff_fcc( &sum,
			  &(call_stack[i].fcc), current_fcc );
	    copy_fcc( &(call_stack[i].fcc), &tmp );

	    p = VG_(sprintf)(buf, "events-%d-%d: ",t, i);
	    p += sprint_fcc(buf + p, &sum );
	    p += VG_(sprintf)(buf+p, "\n");
	    VG_(write)(fd, (void*)buf, p);
	}
	/* Here i == call_stack_sp.  Report the function called by the
	 * topmost frame - but only if there is a frame at all: with an
	 * empty call stack, call_stack[i-1] would read the element
	 * before the start of the array.
	 */
	if ((i > 0) && call_stack[i-1].jcc) {
	    to = call_stack[i-1].jcc->to;
	    p = VG_(sprintf)(buf, "function-%d-%d: %s\n",t, i, 
			     to->cxt->fn[0]->name );	    
	    VG_(write)(fd, (void*)buf, p);
	}

    }
    if (current_tid != orig_tid) switch_thread(orig_tid);

    return fd;
}

/* forward decl: finish() is called by thread_run() when a kill command
 * was read from the command file; its definition is not in this part of
 * the file. */
static void finish();

/* Housekeeping done before thread tid continues running:
 *
 *  1. If periodic dumps are requested (clo_dumps > 0) and more than
 *     clo_dumps basic blocks were executed since the last one, trigger a
 *     dump via cachesim_flush().
 *  2. Poll a command file.  Each call looks at one of two file names
 *     (command_file / command_file2), alternating between calls.  Up to
 *     500 bytes are read and interpreted line by line; the first character
 *     of a line selects the command (case-insensitive):
 *        D  dump (rest of the line is used as dump trigger description)
 *        Z  zero cost centres
 *        K  kill the run (executed after the command file was removed)
 *        I  write run info to the result file   (dump_info)
 *        S  write state info to the result file (dump_state)
 *        O  write option info to the result file
 *     Other lines are ignored.  If anything was read, the command file is
 *     deleted afterwards.
 *  3. Switch the per-thread state to tid via switch_thread().
 *
 * Also called from pre_signal().
 */
static void thread_run(ThreadId tid)
{
    /* check for dumps needed */
    static ULong last_bbs_done = 0;
    /* Must hold "Dump Command:" (13 chars) followed by up to 499 chars of
     * the command line (cmdBuffer holds at most 500 chars, at least one of
     * which is the command letter) and the terminating NUL: 513 bytes.
     */
    static Char buf[512 + 16];
    static Char cmdBuffer[512];
    Char *cmdPos = 0, *cmdNextLine = 0;
    Int fd, bytesRead = 0, do_kill = 0;

    /* toggle between 2 command files, with/without ".pid" postfix */
    static Char* cfile = 0;
    cfile = ((cfile == command_file) || (cfile == 0)) ? 
      command_file2 : command_file;

    if (clo_dumps >0) {
       if (VG_(bbs_done) - last_bbs_done > clo_dumps) {
           VG_(sprintf)(buf, "--dumps=%d", clo_dumps);
	   cachesim_flush(buf, False);
           last_bbs_done = VG_(bbs_done);
       }
    }
    
    fd = VG_(open)(cfile, VKI_O_RDONLY,0);
    if (fd>=0) {
	bytesRead = VG_(read)(fd,cmdBuffer,500);
	VG_(close)(fd);
	/* don't delete command file on read error (e.g. EAGAIN) */
	if (bytesRead>0) {
	    /* Terminate exactly after the bytes read.  cmdBuffer is static
	     * and keeps the contents of earlier command files; without
	     * this, the parser below (whose byte accounting does not count
	     * the first character of a line) could run into leftovers of a
	     * previous, longer command file and execute them again.
	     * bytesRead <= 500, so the index is inside the buffer.
	     */
	    cmdBuffer[bytesRead] = 0;
	    cmdPos = cmdBuffer;
	}
    }

    /* force creation of result file if needed */
    fd = -2;

    while((bytesRead>0) && *cmdPos) {
      
	/* Calculate pointer for next line */
	cmdNextLine = cmdPos+1;
	while((bytesRead>0) && *cmdNextLine && (*cmdNextLine != '\n')) {
	  cmdNextLine++;
	  bytesRead--;
	}
	if ((bytesRead>0) && (*cmdNextLine == '\n')) {
	  /* cut the current line off, continue behind the newline */
	  *cmdNextLine = 0;
	  cmdNextLine++;
	  bytesRead--;
	} 

	switch(*cmdPos) {
	case 'D':
	case 'd':
	  /* DUMP */

	  /* skip command */
	  while(*cmdPos && (*cmdPos != ' ')) cmdPos++;
	  if (*cmdPos)
	    VG_(sprintf)(buf, "Dump Command:%s", cmdPos);
	  else
	    VG_(sprintf)(buf, "Dump Command");
	  cachesim_flush(buf, False);
	  break;
	    
	case 'Z':
	case 'z':
	    /* ZERO CCs */
	    zero_ccs(False);
	    break;

	case 'K':
	case 'k':
	    /* Kill: Delay to be able to remove command file before. */
	    do_kill = 1;
	    break;

	case 'I':
	case 'i':
	    fd = dump_info(fd);
	    break;

	case 's':
	case 'S':
	    fd = dump_state(fd);
	    break;

	case 'O':
	case 'o':
	    /* Options Info */
	    if ( (fd = createRes(fd)) <0) break;

	    VG_(sprintf)(buf, "\ndesc: Option: --skip-plt=%s\n",
			 clo_skip_plt ? "yes" : "no");
	    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));	    
	    VG_(sprintf)(buf, "desc: Option: --trace-jump=%s\n",
			 clo_trace_jump ? "yes" : "no");
	    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
	    VG_(sprintf)(buf, "desc: Option: --fn-recursion=%d\n",
			 clo_fn_recursion);
	    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
	    VG_(sprintf)(buf, "desc: Option: --fn-caller=%d\n",
			 clo_fn_caller);
	    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));

	    break;

	default:
	  break;
	}

	cmdPos = cmdNextLine;
    }

    /* If command executed, delete command file */
    if (cmdPos) VG_(unlink)(cfile);
    /* fd is >= 0 only if one of the commands opened the result file */
    if (fd>=0) VG_(close)(fd);	    

    if (do_kill) {
      VG_(message)(Vg_UserMsg,
		   "Killed because of command from %s", command_file);
      finish();
      VG_(exit)(1);
    }

    /* now check for thread switch */
    switch_thread(tid);
}


/* Prepare cost collection for a signal handler for signal sigNum that is
 * about to run in thread tid.
 *
 * NOTE(review): presumably registered with the Valgrind core as the
 * "before signal delivery" callback; the registration is not visible
 * here.  alt_stack is only used for the debug output.
 *
 * The handler is treated as a spontaneous call in a fresh context:
 *  - the interrupted context is saved with cxtinfo_save(),
 *  - the current context/BBCC globals are reset,
 *  - a new thread_cxtinfo is pushed for the handler, with its own zeroed
 *    cost counter that becomes current_fcc, and the current call stack
 *    depth recorded as the bottom of the handler's call stack.
 * post_signal() reverts this.
 */
static void pre_signal(ThreadId tid, Int sigNum, Bool alt_stack)
{
    thread_cxtinfo *ci;

    CT_DEBUG(0, "pre_signal(TID %d, sig %d, alt_st %s)\n",
	     tid, sigNum, alt_stack ? "yes":"no");

    /* switch to the thread the handler runs in */
    thread_run(tid);

    /* save current context */
    cxtinfo_save();

    /* setup current context for a spontaneous call */
    collect_state = clo_collect_state;
    current_cxt  = 0;
    push_cxt(0);
    bbcc_jmpkind = -1;
    current_bbcc = 0;
    current_nonskipped = 0;    

    /* setup new cxtinfo struct for this signal handler */
    ci = push_cxtinfo(sigNum);
    init_fcc( &(ci->current) );
    current_fcc = &(ci->current);
    /* remembered so post_signal() can unwind exactly the frames pushed
     * while the handler was running */
    ci->call_stack_bottom = call_stack_sp;

    current_sigNum = sigNum;
}

/* Counterpart of pre_signal(): clean up after the handler for signal
 * sigNum has finished in thread tid.
 *
 * Expects tid to be the current thread and sigNum the signal currently
 * being handled (asserted).  Steps:
 *  - pop all call stack frames above the handler's recorded bottom,
 *  - decrement the active count of the function of the current context,
 *  - move fn_stack_top back below the 0 entry that marks the start of the
 *    handler context,
 *  - add the handler's costs to the thread's handler_sum,
 *  - pop the handler's thread_cxtinfo and restore the context that was
 *    active before the signal.
 */
static void post_signal(ThreadId tid, Int sigNum)
{
    thread_cxtinfo *ci;
    Int fn_number;

    CT_DEBUG(0, "post_signal(TID %d, sig %d)\n",
	     tid, sigNum);

    CT_ASSERT(tid == current_tid);
    CT_ASSERT(sigNum == current_sigNum);

    /* Unwind call stack of this signal handler.
     * This should only be needed at finalisation time
     */
    ci = top_cxtinfo();
    while(call_stack_sp > ci->call_stack_bottom)
	pop_call_stack();

    /* correct active counts */
    fn_number = current_cxt->fn[0]->number;
    CT_ASSERT(fn_number < fn_active_array_size);
    CT_ASSERT(fn_active_array[fn_number] == 1);
    fn_active_array[fn_number]--;
    CT_DEBUG(0, "  set active count of %s back to %d\n",
	     current_cxt->fn[0]->name, fn_active_array[fn_number]);

    /* set fn_stack_top back */
    fn_stack_top--;
    /* the entry reached now must be the 0 separator (asserted) */
    CT_ASSERT(*fn_stack_top == 0);
    /* step below the separator unless it is the very first entry */
    if (fn_stack_top > fn_stack)
	fn_stack_top--;

    /* sum up costs */
    CT_ASSERT( &(ci->current) == current_fcc );
    add_fcc( &(thread[current_tid]->handler_sum), current_fcc );
    
    /* restore previous context */
    ci->sigNum = -1;   /* mark the handler's cxtinfo slot as unused */
    thread[current_tid]->cxt_stackpointer --;
    ci = top_cxtinfo();
    current_sigNum = ci->sigNum;
    cxtinfo_restore();
}





/*------------------------------------------------------------*/
/*--- Cache simulation instrumentation phase               ---*/
/*------------------------------------------------------------*/

/* Compute how many bytes of cost centre storage the UCode block cb needs.
 *
 * The UCode is scanned once.  Memory-accessing UInstrs only record what
 * kind of access the current instruction performs (and which temp holds
 * the address); an INCEIP or an unconditional JMP ends the instruction
 * and adds the size of the matching cost centre type:
 *
 *   iddCC  read and write access through different address temps
 *   idCC   any other instruction with a memory access
 *   iCC    instruction without memory access
 *
 * Returns the sum over all instructions ended that way.
 */
static Int compute_BBCC_array_size(UCodeBlock* cb)
{
   UInstr* u_in;
   Int     pos, cc_bytes;
   Int     total_bytes = 0;
   Int     rd_temp = INVALID_TEMPREG;
   Int     wr_temp = INVALID_TEMPREG;
   Bool    is_LOAD  = False, is_STORE = False;
   Bool    is_FPU_R = False, is_FPU_W = False;

   for (pos = 0; pos < VG_(get_num_instrs)(cb); pos++) {
      u_in = VG_(get_instr)(cb, pos);

      switch (u_in->opcode) {

      case JMP:
         /* a conditional jump does not end the instruction */
         if (u_in->cond != CondAlways) break;
         /* fall through */

      case INCEIP:
         /* end of an instruction: choose its cost centre type */
         if (is_LOAD || is_STORE || is_FPU_R || is_FPU_W) {
            Bool reads_and_writes = (is_LOAD && is_STORE) ||
                                    (is_FPU_R && is_FPU_W);

            if (reads_and_writes && (rd_temp != wr_temp))
               cc_bytes = sizeof(iddCC);
            else
               cc_bytes = sizeof(idCC);
         }
         else {
            cc_bytes = sizeof(iCC);
         }
         total_bytes += cc_bytes;

         /* the next instruction starts without any access recorded */
         is_LOAD = is_STORE = is_FPU_R = is_FPU_W = False;
         break;

      case LOAD:
         /* Two LDBs are possible for a single instruction.
          * Also, a STORE can come after a LOAD for bts/btr/btc,
          * so only the FPU flags are checked here. */
         CT_ASSERT(!is_FPU_R && !is_FPU_W);
         rd_temp = u_in->val1;
         is_LOAD = True;
         break;

      case STORE:
         /* Multiple STOREs are possible for 'pushal' */
         CT_ASSERT(!is_FPU_R && !is_FPU_W);
         wr_temp = u_in->val2;
         is_STORE = True;
         break;

      /* --- reads by FPU/MMX/SSE instructions --- */

      case MMX2_MemRd:
         CT_ASSERT(u_in->size == 4 || u_in->size == 8);
         /* fall through */

      case FPU_R:
         CT_ASSERT(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
         rd_temp = u_in->val2;
         is_FPU_R = True;
         break;

      case SSE2a_MemRd:
      case SSE2a1_MemRd:
         CT_ASSERT(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
         rd_temp = u_in->val3;
         is_FPU_R = True;
         break;

      case SSE3a_MemRd:
         CT_ASSERT(u_in->size == 4 || u_in->size == 8 || u_in->size == 16);
         rd_temp = u_in->val3;
         is_FPU_R = True;
         break;

#if VG_CORE_INTERFACE_MAJOR_VERSION > 3
      /* Supported since VG-20031104 */
      case SSE3a1_MemRd:
         CT_ASSERT(u_in->size == 8 || u_in->size == 16);
         rd_temp = u_in->val3;
         is_FPU_R = True;
         break;
#endif

      case SSE3ag_MemRd_RegWr:
         CT_ASSERT(u_in->size == 4 || u_in->size == 8);
         rd_temp = u_in->val1;
         is_FPU_R = True;
         break;

      /* --- writes by FPU/MMX/SSE instructions --- */

      case MMX2_MemWr:
         CT_ASSERT(u_in->size == 4 || u_in->size == 8);
         /* fall through */

      case FPU_W:
         CT_ASSERT(!is_LOAD && !is_STORE && !is_FPU_R && !is_FPU_W);
         wr_temp = u_in->val2;
         is_FPU_W = True;
         break;

      case SSE2a_MemWr:
         CT_ASSERT(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
         wr_temp = u_in->val3;
         is_FPU_W = True;
         break;

      case SSE3a_MemWr:
         CT_ASSERT(u_in->size == 4 || u_in->size == 8 || u_in->size == 16);
         wr_temp = u_in->val3;
         is_FPU_W = True;
         break;

      default:
         break;
      }
   }

   return total_bytes;
}



/* for _libc_freeres_wrapper => _exit renaming:
 * the first BB for which new_bb_fn() found the function name "_exit";
 * 0 as long as no such BB has been seen. */
static BB* exit_bb = 0;


/*
 * Return the function struct for a BB, creating it from debug info the
 * first time it is asked for.
 *
 * The result is cached in bb->fn (together with the source line in
 * bb->line), so later calls for the same BB return immediately.
 * As side effects, bb->is_entry is set if the BB address is the entry
 * point of a named function, and the first BB of "_exit" is remembered
 * in exit_bb.
 */
static
fn_node* new_bb_fn(BB* bb)
{
    Char       file_buf[FILENAME_LEN], name_buf[FN_NAME_LEN];
    SegInfo*   seg;
    Int        line;
    fn_node*   node;

    /* fn from debug info is idempotent for a BB */
    if (bb->fn != 0)
	return bb->fn;

    CT_DEBUG(3,"+ new_bb_fn(BB 0x%x)\n", bb->addr);

    get_debug_info(bb->addr, file_buf, name_buf, &line, &seg);

    if (VG_(strcmp)(name_buf, "???") != 0) {
	/* named function: check whether this BB is its entry point */
	if (VG_(get_fnname_if_entry)(bb->addr, name_buf, FN_NAME_LEN))
	    bb->is_entry = 1;
    }
    else {
	/* No name available: use address as found in library, tagged
	 * with the section kind for non-code sections */
	const char* kind_tag;

	if      (bb->sect_kind == Vg_SectData) kind_tag = " [Data]";
	else if (bb->sect_kind == Vg_SectBSS)  kind_tag = " [BSS]";
	else if (bb->sect_kind == Vg_SectGOT)  kind_tag = " [GOT]";
	else if (bb->sect_kind == Vg_SectPLT)  kind_tag = " [PLT]";
	else                                   kind_tag = "";

	VG_(sprintf)(name_buf, "0x%08x%s",
		     bb->addr - bb->obj->offset, kind_tag);
    }

    /* HACK for correct _exit:
     * _exit is redirected to VG_(__libc_freeres_wrapper) by valgrind,
     * so we rename it back again :-)
     * This only works once a BB of _exit itself has been seen.
     */
    if (exit_bb &&
	(VG_(strcmp)(name_buf, "vgPlain___libc_freeres_wrapper") == 0)) {
	get_debug_info(exit_bb->addr, file_buf, name_buf, &line, &seg);

	CT_DEBUG(1, "__libc_freeres_wrapper renamed to _exit\n");
    }
    if (!exit_bb && (VG_(strcmp)(name_buf, "_exit") == 0))
	exit_bb = bb;

    CT_DEBUG(3,"- new_bb_fn: %s\n", name_buf);

    node = get_fn_node2( seg, file_buf, name_buf);

    /* functions in the PLT are skipped if requested on the command line */
    if (bb->sect_kind == Vg_SectPLT)
	node->skip = clo_skip_plt;

    update_fn_config(node);

    /* Every function gets a "pure" context, i.e. a context with stack
     * depth 1 only with this function. This is for compression of mangled
     * names.  The context is requested with a pointer to the function
     * entry, which is preceded by a 0 entry.
     */
    if (node->pure_cxt == 0) {
	fn_node* pure[2];

	pure[0] = 0;
	pure[1] = node;
	node->pure_cxt = get_cxt(&pure[1]);
    }

    bb->line = line;
    bb->fn   = node;

    return node;
}


/* Called with the BB that is about to be executed; selects the BBCC that
 * gets the costs of this execution and keeps the shadow call stack and
 * the function context in sync.
 *
 * Inputs taken from globals: bbcc_jmpkind (kind of the jump that led
 * here), current_bbcc (BBCC of the previously executed BB, may be 0),
 * current_cxt, the call/function stacks and current_nonskipped.
 *
 * Steps:
 *  1. A non-CALL/RET jump is turned into an emulated CALL if it enters a
 *     function at its first BB or crosses ELF object / section kind.
 *  2. RET: either a return from the top of the known call stack (the
 *     missing caller is reconstructed), or the call stack is unwound
 *     according to the stack pointer; a return that matches no call is
 *     simulated as RET followed by CALL.
 *     Otherwise: unwind the call stack and, for a CALL, remember that a
 *     frame has to be pushed once the target BBCC is known
 *     (delayed_push).
 *  3. Push a new context if needed, then find or create the BBCC for
 *     (bb, current context), taking the recursion level into account.
 *  4. Do the delayed call stack push, count followed jumps in JCCs,
 *     update execution counters, and make the BBCC the current one.
 *
 * On exit bbcc_jmpkind is reset to JmpNone and current_bbcc is the BBCC
 * chosen for this BB.
 */
static __attribute__ ((regparm (1)))
void setup_bbcc(BB* bb)
{
    BBCC* bbcc;
    Bool  call_emulation = False, delayed_push = False, skip;
    Addr esp;

    VGP_PUSHCC(VgpCacheSetup);

    CT_DEBUG(3,"+ setup_bbcc(BB 0x%x)\n", bb->addr);

    esp = VG_(get_stack_pointer)();

    /* Manipulate JmpKind if needed, only using BB specific info */

    if ((bbcc_jmpkind != JmpRet) && (bbcc_jmpkind != JmpCall)) {

	/* We simulate a JMP to be a CALL if
	 * - current_bbcc != 0, i.e. not first instruction
	 * - jump is in another ELF object or section kind
	 * - jump is to first instruction of a function (tail recursion)
	 */
	if (current_bbcc && 
	    (bb->is_entry ||
	     (current_bbcc->bb->sect_kind != bb->sect_kind) ||
	     (current_bbcc->bb->obj->number != bb->obj->number))) {

	    bbcc_jmpkind = JmpCall;
	    call_emulation = True;

	    CT_DEBUG(1,"     JMP from %s to %s!\n",
			current_bbcc->bb->obj->name,
			bb->obj->name);
	}
    }

    /* a call into a function marked as skipped */
    skip = (bbcc_jmpkind == JmpCall) && new_bb_fn(bb)->skip;

    CT_DEBUGIF(1) {
	VG_(printf)("%s %08x -> %08x, ESP %08x\n",
		    (bbcc_jmpkind == JmpNone)  ? "CONT" :
		    (bbcc_jmpkind == JmpCond)  ? "JCND" :
		    (bbcc_jmpkind == JmpRet)   ? "RET " :
		    (bbcc_jmpkind == JmpCall)  ? "CALL" :
		    (bbcc_jmpkind == JmpBoring)? "JMP " :
		    (bbcc_jmpkind == JmpSyscall)? "SYSC" : "CREQ",
		    current_bbcc ? current_bbcc->bb->jmp_addr : 0,
		    bb->addr, esp);
    }

    /* Handle CALL/RET and update context to get correct BBCC */

    if (bbcc_jmpkind == JmpRet) {
	/* The call stack is empty, or the function stack entry below the
	 * top is the 0 separator: there is no known caller to return to. */
	if ((call_stack_sp == 0) || 
	    ((fn_stack_top > fn_stack) && (*(fn_stack_top-1)==0)) ) {
	    /* RET at top of call stack */
	    BBCC* source_bbcc;
	    jCC* jcc;
	    Bool seen_before;
	    fn_node* caller;
	    int fn_number;	    

	    CT_DEBUG(1,"     RET at TOP!\n");

	    /* we emulate an old call from the function we return to
	     * by using (<return address> -1) */
	    source_bbcc = get_bbcc(bb->addr-1, 0,
				   False, &seen_before);

	    /* seen_before can be true if RET from a signal handler */
	    if (!seen_before) {
		source_bbcc->bb->jmp_addr = source_bbcc->bb->addr;
		source_bbcc->bb->size =  1;
		source_bbcc->exe_counter = collect_state ? 1 : 0;
	    }
	    else if (collect_state)
		source_bbcc->exe_counter++;
		
	    /* Force a new top context, will be set active by push_cxt() */
	    fn_stack_top--;
	    current_cxt = 0;
	    caller = new_bb_fn(source_bbcc->bb);
	    push_cxt( caller );

	    if (!seen_before) {
		/* set rec array for source BBCC: this is at rec level 1 */
		source_bbcc->rec_array = new_recursion(caller->fn_recursion);
		source_bbcc->rec_array[0] = source_bbcc;

		CT_ASSERT(source_bbcc->cxt == 0);
		source_bbcc->cxt = current_cxt;
		insert_bbcc_into_hash(source_bbcc);
	    }
	    CT_ASSERT(current_bbcc);

	    /* correct active counts */
	    fn_number = current_bbcc->cxt->fn[0]->number;
	    CT_ASSERT(fn_number < fn_active_array_size);
	    /* This assertion is not correct for reentrant
	     * signal handlers */
	    /* CT_ASSERT(fn_active_array[fn_number] == 1); */
	    fn_active_array[fn_number]--;

	    current_nonskipped = 0; /* we didn't skip this function */
	    /* back to current context */
	    push_cxt( current_bbcc->cxt->fn[0] );
	    push_call_stack(source_bbcc, current_bbcc, (Addr)-1, False);
	    /* NOTE(review): jcc is assigned here but not read again in
	     * this function */
	    jcc = call_stack[call_stack_sp-1].jcc;
	    /* assume this call is lasting since last dump or
	     * for a signal handler since it's call */
	    if (current_sigNum == 0)
		copy_fcc( &(call_stack[call_stack_sp-1].fcc),
			  &(thread[current_tid]->last) );
	    else
		init_fcc( &(call_stack[call_stack_sp-1].fcc) );

	    /* the emulated call is left again immediately */
	    pop_call_stack();
	}
	else {
	    if (unwind_call_stack(esp)==0) {

		/* return without call, simulate with RET/CALL */
		CT_DEBUG(1,"     RET without call!\n");

		/* change source for delayed push */
		if (call_stack[call_stack_sp-1].jcc) {
		    current_bbcc = call_stack[call_stack_sp-1].jcc->from;
		    esp = call_stack[call_stack_sp-1].esp;
		    pop_call_stack();
		}
		else {
		    CT_ASSERT(current_nonskipped != 0);
		}

		skip = new_bb_fn(bb)->skip;
		delayed_push = True;
	    }
	}
    }
    else {
	unwind_call_stack(esp);

	if (bbcc_jmpkind == JmpCall) {
	    delayed_push = True;

	    /* an emulated call (really a jump) reuses the stack pointer
	     * recorded for the frame currently on top */
	    if (call_emulation && call_stack_sp>0)
                esp = call_stack[call_stack_sp-1].esp;
	}
    }


    /* Change new context if needed, taking delayed_push into account */
    if ((delayed_push && !skip) || (current_cxt == 0)) {
	push_cxt(new_bb_fn(bb));
    }
    CT_ASSERT(fn_stack_top > fn_stack);

    /* If there is a fresh instrumented BBCC, assign current context */
    CT_ASSERT(bb->last_bbcc != 0);
    if (bb->last_bbcc->cxt == 0) {
	bbcc = bb->last_bbcc;
	CT_ASSERT(bbcc->rec_array == 0);
	
	bbcc->cxt = current_cxt;
	bbcc->rec_array = new_recursion((*fn_stack_top)->fn_recursion);
	bbcc->rec_array[0] = bbcc;

	insert_bbcc_into_hash(bbcc);
    }
    else {
	/* get BBCC with current context */

	/* first check LRU of last bbcc executed */
	bbcc = (current_bbcc) ? current_bbcc->lru_next_bbcc : 0;
	if (bbcc && ((bbcc->bb != bb) || (bbcc->cxt != current_cxt))) bbcc = 0;

	/* then the hash; if there is none yet for this context, clone */
	if (!bbcc)
	    bbcc = lookup_bbcc(bb, 0, current_cxt);
	if (!bbcc)
	    bbcc = clone_bbcc(bb->bbcc_list, current_cxt, 0);

	bb->last_bbcc = bbcc;
    }

    /* save for fast lookup */
    if (current_bbcc) current_bbcc->lru_next_bbcc = bbcc;

    /* Recursion levels are separated for this function: select (or
     * create) the BBCC for the current recursion level */
    if ((*fn_stack_top)->fn_recursion>1) {
	UInt level, idx;

	CT_ASSERT((*fn_stack_top)->number < fn_active_array_size);
	level = fn_active_array[(*fn_stack_top)->number];

	if (delayed_push && !skip) {
	    if (clo_skip_direct_recursion) {
		/* do not increment rec. level if called from
		 * same function */
		if (!current_bbcc || 
		    (current_bbcc->cxt->fn[0] != bbcc->cxt->fn[0]))
		    level++;
	    }
	    else level++;
	}
	/* levels above the configured maximum share the last BBCC */
	if (level>(*fn_stack_top)->fn_recursion)
	    level = (*fn_stack_top)->fn_recursion;

	/* if (level<1) level = 1; */

	CT_ASSERT(level>0);
	idx = level -1;

	if (bbcc->rec_array[idx])
	    bbcc = bbcc->rec_array[idx];
	else
	    bbcc = clone_bbcc(bbcc, current_cxt, idx);

	CT_ASSERT(bbcc->rec_array[bbcc->rec_index] == bbcc);
    }

    if (delayed_push) {
	if (!skip && current_nonskipped) {
	    /* a call from skipped to nonskipped */
	    current_bbcc = current_nonskipped;
	}
	push_call_stack(current_bbcc, bbcc, esp, skip);
    }

    if (bbcc_jmpkind == JmpCond || bbcc_jmpkind == JmpBoring) {
	/* Handle conditional jumps followed, i.e. trace arcs
	 * This uses JCC structures, too */

	jCC* jcc = get_jcc(current_bbcc, bbcc);
	CT_ASSERT(jcc != 0);
	// Change from default, and check if already changed
	if (jcc->jmpkind == JmpCall)
	    jcc->jmpkind = bbcc_jmpkind;
	else
	    CT_ASSERT(jcc->jmpkind == bbcc_jmpkind);

	jcc->call_counter++;
	if (bbcc_jmpkind == JmpCond)
	    jcnd_counter++;
	else
	    jump_counter++;
    }

    bbcc_jmpkind = JmpNone;
    current_bbcc = bbcc;

    CT_DEBUGIF(1) {
	VG_(printf)("     ");
	print_bbcc_fn(bbcc);
	VG_(printf)("\n");
    }

    CT_DEBUG(3,"- setup_bbcc (BB 0x%x): 0x%p,", bb->addr, bbcc);
    CT_DEBUGIF(3)
	print_cxt(-2, current_cxt, current_bbcc->rec_index);
    CT_DEBUG(3,"\n");
    
    /* executions inside skipped functions are not counted for the BBCC */
    if (collect_state && !current_nonskipped) bbcc->exe_counter++;
    bb_executions++;

#if JCC_DEBUG
    /* Basic block number has changed: print BB# on next debug output */
    jcc_debug_bb_written = False;
#endif

    if (!clo_simulate_cache && collect_state) {
	/* even with skipping the log_ calls, we have to increment
	 * the global counter */
	current_fcc->Ir.a += bbcc->bb->instr_count;
    }

    VGP_POPCC(VgpCacheSetup);
}



/* Book one memory access into two cost centres at once: the specific one
 * (cc) and the running total (current_cc).
 *
 * miss == 0: access only; miss == 1: additionally a first level miss;
 * miss == 2: additionally a first and a second level miss.
 * Any other value is asserted against (after counting it like 2).
 */
static __inline__
void inc_cc_counters(int miss, CC* cc, CC* current_cc)
{
   /* every access is counted */
   cc->a++;
   current_cc->a++;

   if (miss != 0) {
      /* first level miss */
      cc->m1++;
      current_cc->m1++;

      if (miss != 1) {
         /* second level miss */
         cc->m2++;
         current_cc->m2++;

         CT_ASSERT(miss==2);
      }
   }
}



/* Simulate the instruction fetch of an instruction without data access.
 *
 * diff_cc is the byte offset of the instruction's iCC relative to the
 * start of the current BBCC.  The fetch always goes through the cache
 * simulator; the result is only counted while collect_state is set,
 * either in the cost centre itself or - inside a skipped function - in
 * the skipped-cost of the last non-skipped BBCC, and always in the
 * running total current_fcc.
 */
static __attribute__ ((regparm (1)))
void log_1I_0D_cache_access(UInt diff_cc)
{
   iCC* cc;
   Int  i_miss;

   /* no current BBCC: nothing the offset could refer to */
   if (current_bbcc == 0) return;

   VGP_PUSHCC(VgpCacheSimulate);

   cc = (iCC*)((Addr)current_bbcc + diff_cc);
   CT_DEBUG(6,"1I_0D: CCaddr=0x%x, iaddr=0x%x, isize=%u\n",
               cc, cc->instr_addr, cc->instr_size);

   i_miss = cachesim_I1_doref(cc->instr_addr, cc->instr_size);

   if (collect_state) {
       CC* instr_cost;

       if (current_nonskipped)
	   instr_cost = &(current_nonskipped->skipped->fcc.Ir);
       else
	   instr_cost = &(cc->I);

       inc_cc_counters(i_miss, instr_cost, &(current_fcc->Ir) );
   }

   VGP_POPCC(VgpCacheSimulate);
}

/* Variant of log_1I_0D_cache_access() that accepts any kind of cost
   centre, not just an iCC: the instruction address, the instruction size
   and the instruction cost are fetched according to the tag of the cost
   centre, so no assumption about a common layout of the CC types is made.
   (As it stands, the assumption would hold, but this avoids trouble if
   the CC layout is ever changed.)
   Only the JIFZ case uses this switching version; using it everywhere
   would make things run about 5% slower. */
static __attribute__ ((regparm (1)))
void log_1I_0D_cache_access_JIFZ(UInt diff_cc)
{
   iCC*  cc;
   CC*   I;
   Addr  iaddr;
   UChar isize;
   Int   i_miss;

   /* no current BBCC: nothing the offset could refer to */
   if (current_bbcc == 0) return;

   VGP_PUSHCC(VgpCacheSimulate);
   cc = (iCC*)((Addr)current_bbcc + diff_cc);

   /* pick the fields through the type that matches the tag */
   switch(cc->tag) {
       case InstrCC:
           iaddr = cc->instr_addr;
           isize = cc->instr_size;
           I = &(cc->I);
           break;

       case ReadCC:
       case WriteCC:
       case ModCC: {
           idCC* one_data = (idCC*)cc;

           iaddr = one_data->instr_addr;
           isize = one_data->instr_size;
           I = &(one_data->I);
           break;
       }

       case ReadWriteCC: {
           iddCC* two_data = (iddCC*)cc;

           iaddr = two_data->instr_addr;
           isize = two_data->instr_size;
           I = &(two_data->I);
           break;
       }

       default:
           VG_(skin_panic)("Unknown CC type in log_1I_0D_cache_access_JIFZ()\n");
           break;
   }

   CT_DEBUG(6,"1I_0D: CCaddr=0x%x, iaddr=0x%x, isize=%u\n",
               cc, iaddr, isize);

   i_miss = cachesim_I1_doref(iaddr, isize);

   if (collect_state) {
       /* inside a skipped function the cost goes to the last
        * non-skipped BBCC instead */
       if (current_nonskipped)
	   I = &(current_nonskipped->skipped->fcc.Ir);

       inc_cc_counters(i_miss, I, &(current_fcc->Ir) );
   }

   VGP_POPCC(VgpCacheSimulate);
}

/* Simulate one data access at data_addr without an instruction fetch.
 *
 * diff_cc is the byte offset of the idCC inside the current BBCC.  The
 * access counts as a write if the cost centre is tagged WriteCC and as a
 * read otherwise.  The cache simulator is always run; costs are only
 * booked while collect_state is set.
 */
__attribute__ ((regparm (2))) static 
void log_0I_1D_cache_access(UInt diff_cc, Addr data_addr)
{
   idCC* cc;
   CC*   total_D;
   Int   d_miss;

   /* no current BBCC: nothing the offset could refer to */
   if (current_bbcc == 0) return;

   VGP_PUSHCC(VgpCacheSimulate);
   cc = (idCC*)((Addr)current_bbcc + diff_cc);

   /* running total that matches the access direction */
   if (cc->tag == WriteCC)
       total_D = &(current_fcc->Dw);
   else
       total_D = &(current_fcc->Dr);

   CT_ASSERT(cc->tag == ReadCC || cc->tag == WriteCC || cc->tag == ModCC);

   CT_DEBUG(6,"0I_1D: CCaddr=%p, iaddr=%x, isize=%u, daddr=%x, dsize=%u\n",
               cc, cc->instr_addr, cc->instr_size, data_addr, cc->data_size);

   d_miss = cachesim_D1_doref(data_addr, cc->data_size);

   if (collect_state) {
       CC* D = &(cc->D);

       /* inside a skipped function the cost goes to the last
        * non-skipped BBCC instead */
       if (current_nonskipped) {
	   if (cc->tag == WriteCC)
	       D = &(current_nonskipped->skipped->fcc.Dw);
	   else
	       D = &(current_nonskipped->skipped->fcc.Dr);
       }

       inc_cc_counters(d_miss, D, total_D );
   }

   VGP_POPCC(VgpCacheSimulate);
}

/* Simulate the instruction fetch and the single data access (at
 * data_addr) of one instruction.
 *
 * diff_cc is the byte offset of the idCC inside the current BBCC.  The
 * data access counts as a write if the cost centre is tagged WriteCC and
 * as a read otherwise.  Both accesses always go through the cache
 * simulator (instruction first); costs are only booked while
 * collect_state is set.
 */
__attribute__ ((regparm (2))) static
void log_1I_1D_cache_access(UInt diff_cc, Addr data_addr)
{
   idCC* cc;
   CC*   total_D;
   Int   i_miss, d_miss;

   /* no current BBCC: nothing the offset could refer to */
   if (current_bbcc == 0) return;

   VGP_PUSHCC(VgpCacheSimulate);
   cc = (idCC*)((Addr)current_bbcc + diff_cc);

   CT_DEBUG(6,"1I_1D: CCaddr=%p, iaddr=%x, isize=%u, daddr=%x, dsize=%u\n",
               cc, cc->instr_addr, cc->instr_size, data_addr, cc->data_size);

   /* running total that matches the access direction */
   if (cc->tag == WriteCC)
       total_D = &(current_fcc->Dw);
   else
       total_D = &(current_fcc->Dr);

   CT_ASSERT(cc->tag == ReadCC || cc->tag == WriteCC || cc->tag == ModCC);

   i_miss = cachesim_I1_doref(cc->instr_addr, cc->instr_size);
   d_miss = cachesim_D1_doref(data_addr, cc->data_size);

   if (collect_state) {
       CC* I = &(cc->I);
       CC* D = &(cc->D);

       /* inside a skipped function the costs go to the last
        * non-skipped BBCC instead */
       if (current_nonskipped) {
	   if (cc->tag == WriteCC)
	       D = &(current_nonskipped->skipped->fcc.Dw);
	   else
	       D = &(current_nonskipped->skipped->fcc.Dr);
	   I = &(current_nonskipped->skipped->fcc.Ir);
       }

       inc_cc_counters(i_miss, I, &(current_fcc->Ir) );
       inc_cc_counters(d_miss, D, total_D);
   }

   VGP_POPCC(VgpCacheSimulate);
}

/* Simulate the two data accesses of a read-and-write instruction without
 * an instruction fetch.
 *
 * diff_cc is the byte offset of the iddCC (tag ReadWriteCC, asserted)
 * inside the current BBCC.  data_addr1 is the address read, data_addr2
 * the address written.  Both accesses always go through the cache
 * simulator (read first); costs are only booked while collect_state is
 * set.
 */
__attribute__ ((regparm (3))) static 
void log_0I_2D_cache_access(UInt diff_cc, Addr data_addr1, Addr data_addr2)
{
   iddCC* cc;
   Int    rd_miss, wr_miss;

   /* no current BBCC: nothing the offset could refer to */
   if (current_bbcc == 0) return;

   VGP_PUSHCC(VgpCacheSimulate);
   cc = (iddCC*)((Addr)current_bbcc + diff_cc);

   CT_ASSERT(cc->tag == ReadWriteCC);

   CT_DEBUG(6,"0I_2D: CCaddr=%p, iaddr=%x, isize=%u, daddr1=0x%x, daddr2=%x, dsize=%u\n",
               cc, cc->instr_addr, cc->instr_size, data_addr1, data_addr2, cc->data_size);

   rd_miss = cachesim_D1_doref(data_addr1, cc->data_size);
   wr_miss = cachesim_D1_doref(data_addr2, cc->data_size);

   if (collect_state) {
       /* for READ_WRITE_CC, addr1/Da is read, addr2/Db is write access */
       CC* Da = &(cc->Da);
       CC* Db = &(cc->Db);

       /* inside a skipped function the costs go to the last
        * non-skipped BBCC instead */
       if (current_nonskipped) {
	   Da = &(current_nonskipped->skipped->fcc.Dr);
	   Db = &(current_nonskipped->skipped->fcc.Dw);
       }

       inc_cc_counters(rd_miss, Da, &(current_fcc->Dr) );
       inc_cc_counters(wr_miss, Db, &(current_fcc->Dw) );
   }

   VGP_POPCC(VgpCacheSimulate);
}

/* Simulate the instruction fetch and both data accesses of a
 * read-and-write instruction.
 *
 * diff_cc is the byte offset of the iddCC (tag ReadWriteCC, asserted)
 * inside the current BBCC.  data_addr1 is the address read, data_addr2
 * the address written.  All three accesses always go through the cache
 * simulator (instruction, read, write - in this order); costs are only
 * booked while collect_state is set.
 */
__attribute__ ((regparm (3))) static
void log_1I_2D_cache_access(UInt diff_cc, Addr data_addr1, Addr data_addr2)
{
   iddCC* cc;
   Int    i_miss, rd_miss, wr_miss;

   /* no current BBCC: nothing the offset could refer to */
   if (current_bbcc == 0) return;

   VGP_PUSHCC(VgpCacheSimulate);
   cc = (iddCC*)((Addr)current_bbcc + diff_cc);

   CT_ASSERT(cc->tag == ReadWriteCC);

   CT_DEBUG(6,"1I_2D: CCaddr=%p, iaddr=%x, isize=%u, daddr1=%x, daddr2=%x, dsize=%u\n",
               cc, cc->instr_addr, cc->instr_size, data_addr1, data_addr2, cc->data_size);

   i_miss  = cachesim_I1_doref(cc->instr_addr, cc->instr_size);
   rd_miss = cachesim_D1_doref(data_addr1, cc->data_size);
   wr_miss = cachesim_D1_doref(data_addr2, cc->data_size);

   if (collect_state) {
       /* for READ_WRITE_CC, addr1/Da is read, addr2/Db is write access */
       CC* I  = &(cc->I);
       CC* Da = &(cc->Da);
       CC* Db = &(cc->Db);

       /* inside a skipped function the costs go to the last
        * non-skipped BBCC instead */
       if (current_nonskipped) {
	   I  = &(current_nonskipped->skipped->fcc.Ir);
	   Da = &(current_nonskipped->skipped->fcc.Dr);
	   Db = &(current_nonskipped->skipped->fcc.Dw);
       }

       inc_cc_counters(i_miss,  I,  &(current_fcc->Ir) );
       inc_cc_counters(rd_miss, Da, &(current_fcc->Dr) );
       inc_cc_counters(wr_miss, Db, &(current_fcc->Dw) );
   }

   VGP_POPCC(VgpCacheSimulate);
}




/* Instrument one basic block of UCode.
 *
 * Builds a new UCodeBlock from cb_in.  A call to setup_bbcc() is emitted at
 * the start of the block; every UInstr of cb_in is then copied across, with
 * a call to one of the log_* cache-access helpers (only when
 * clo_simulate_cache is set) inserted before the INCEIP or first JMP that
 * ends each x86 instruction, and a STORE to bbcc_jmpkind inserted before
 * jumps whose kind has to be recorded.  On the first visit of a BB
 * (!BB_seen_before) the cost centres in the BBCC array are initialised too.
 *
 * cb_in     - UCode to instrument; it is freed before returning.
 * orig_addr - address of the first x86 instruction of the block.
 *
 * Returns the newly built, instrumented UCodeBlock.
 */
UCodeBlock* SK_(instrument)(UCodeBlock* cb_in, Addr orig_addr)
{
/* Use this rather than eg. -1 because it's a UInt. */
#define INVALID_DATA_SIZE   999999

   UCodeBlock* cb;
   Int         i;
   UInstr*     u_in;
   BBCC*       BBCC_node;
   Int         t_CC_addr, t_read_addr, t_write_addr, t_data_addr1,
               t_data_addr2, t_read, t_write;
   Int         CC_size = -1;    /* Shut gcc warnings up */
   Addr        x86_instr_addr = orig_addr;
   UInt        instr_count = 0;
   UInt        x86_instr_size, data_size = INVALID_DATA_SIZE;
   Addr        helper;
   Int         argc;
   UInt        stack_used;
   Bool        BB_seen_before     = False;
   Bool        instrumented_Jcond = False;
   Bool        has_rep_prefix     = False;
   Addr        BBCC_ptr0, BBCC_ptr; 

   CT_DEBUG(3, "+ instrument(BB 0x%x)\n", orig_addr);

   /* Get BBCC (creating if necessary -- requires a counting pass over the BB
    * if it's the first time it's been seen), and point to start of the 
    * BBCC array.
    */
   BBCC_node = get_bbcc(orig_addr, cb_in,
			/*remove=*/False, &BB_seen_before);
   BBCC_ptr0 = BBCC_ptr = (Addr)(BBCC_node->array);

   cb = VG_(setup_UCodeBlock)(cb_in);

   t_CC_addr = t_read_addr = t_write_addr = t_data_addr1 = t_data_addr2 =
               t_read = t_write = INVALID_TEMPREG;
   
   /* 
    * Precondition:
    * - bbcc_jmpkind has the kind of last jump executed (CALL/RET/COND...)
    * - current_bbcc has a pointer to the BBCC of the last executed BB
    *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
    *     current_bbcc->jmp_addr
    *   gives the address of the jump source.
    *   
    * The BBCC setup does 2 things:
    * - trace call:
    *   * Unwind own call stack, i.e sync our ESP with real ESP
    *     This is for ESP manipulation (longjmps, C++ exec handling) and RET
    *   * For CALLs or JMPs crossing objects, record call arg +
    *     push are on own call stack
    *
    * - prepare for cache log functions:
    *   Set current_bbcc to BBCC that gets the costs for this BB execution
    *   attached
    */

  /* 1st arg: BBCC addr */
  t_CC_addr = newTemp(cb);
  uInstr2(cb, MOV,   4, Literal, 0, TempReg, t_CC_addr);
  uLiteral(cb, (Addr)(BBCC_node->bb));
  uInstr1(cb, CCALL, 0, TempReg, t_CC_addr);
  uCCall(cb, (Addr) & setup_bbcc, 1, 1, False);

  for (i = 0; i < VG_(get_num_instrs)(cb_in); i++) {
      u_in = VG_(get_instr)(cb_in, i);

      /* What this is all about:  we want to instrument each x86 instruction 
       * translation.  The end of these are marked in three ways.  The three
       * ways, and the way we instrument them, are as follows:
       *
       * 1. UCode, INCEIP         --> UCode, Instrumentation, INCEIP
       * 2. UCode, Juncond        --> UCode, Instrumentation, Juncond
       * 3. UCode, Jcond, Juncond --> UCode, Instrumentation, Jcond, Juncond
       *
       * The last UInstr in a basic block is always a Juncond.  Jconds,
       * when they appear, are always second last.  We check this with 
       * various assertions.
       *
       * We must put the instrumentation before any jumps so that it is always
       * executed.  We don't have to put the instrumentation before the INCEIP
       * (it could go after) but we do so for consistency.
       *
       * x86 instruction sizes are obtained from INCEIPs (for case 1) or
       * from .extra4b field of the final JMP (for case 2 & 3).
       *
       * Note that JIFZ is treated differently.
       *
       * The instrumentation is just a call to the appropriate helper function,
       * passing it the address of the instruction's CC.
       */
      if (instrumented_Jcond) CT_ASSERT(u_in->opcode == JMP);

      switch (u_in->opcode) {
         case NOP:  case LOCK:  case CALLM_E:  case CALLM_S:
            break;

         /* For memory-ref instrs, copy the data_addr into a temporary to be
          * passed to the cachesim_* helper at the end of the instruction.
          */
         case LOAD: 
            t_read      = u_in->val1;
            t_read_addr = newTemp(cb);
            uInstr2(cb, MOV, 4, TempReg, u_in->val1,  TempReg, t_read_addr);
            data_size = u_in->size;
            VG_(copy_UInstr)(cb, u_in);
            break;

         case MMX2_MemRd:
            CT_ASSERT(u_in->size == 4 || u_in->size == 8);
            /* fall through */

         case FPU_R:
            t_read      = u_in->val2;
            t_read_addr = newTemp(cb);
            uInstr2(cb, MOV, 4, TempReg, u_in->val2,  TempReg, t_read_addr);
            /* Clamp the recorded access size to MIN_LINE_SIZE. */
            data_size = ( u_in->size <= MIN_LINE_SIZE
                        ? u_in->size
                        : MIN_LINE_SIZE);
            VG_(copy_UInstr)(cb, u_in);
            break;

      case SSE2a_MemRd:
      case SSE2a1_MemRd:
            CT_ASSERT(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
	    t_read = u_in->val3;
            t_read_addr = newTemp(cb);
            uInstr2(cb, MOV, 4, TempReg, u_in->val3,  TempReg, t_read_addr);
            /* 512 B data-sized instructions will be done inaccurately
             * but they're very rare and this avoids errors from
             * hitting more than two cache lines in the simulation. */
            data_size = ( u_in->size <= MIN_LINE_SIZE
                        ? u_in->size
                        : MIN_LINE_SIZE);
            VG_(copy_UInstr)(cb, u_in);
            break;

      case SSE3a_MemRd:
            CT_ASSERT(u_in->size == 4 || u_in->size == 8 || u_in->size == 16);
	    t_read = u_in->val3;
            t_read_addr = newTemp(cb);
            uInstr2(cb, MOV, 4, TempReg, u_in->val3,  TempReg, t_read_addr);
            data_size = u_in->size;
            VG_(copy_UInstr)(cb, u_in);
            break;

#if VG_CORE_INTERFACE_MAJOR_VERSION > 3
	    /* Supported since VG-20031104 */
      case SSE3a1_MemRd:
            CT_ASSERT(u_in->size == 8 || u_in->size == 16);
	    t_read = u_in->val3;
            t_read_addr = newTemp(cb);
            uInstr2(cb, MOV, 4, TempReg, u_in->val3,  TempReg, t_read_addr);
            data_size = u_in->size;
            VG_(copy_UInstr)(cb, u_in);
            break;
#endif

      case SSE3ag_MemRd_RegWr:
            CT_ASSERT(u_in->size == 4 || u_in->size == 8);
	    t_read = u_in->val1;
            t_read_addr = newTemp(cb);
            uInstr2(cb, MOV, 4, TempReg, u_in->val1,  TempReg, t_read_addr);
            data_size = u_in->size;
            VG_(copy_UInstr)(cb, u_in);
            break;

         /* Note that we must set t_write_addr even for mod instructions;
          * That's how the code below (at instrument_x86_instr) determines
          * whether it does a write.
          * Without it, it would think a mod instruction is a read.
          * As for the MOV, if it's a mod instruction it's redundant, but it's
          * not expensive and mod instructions are rare anyway. */
         case MMX2_MemWr:
            CT_ASSERT(u_in->size == 4 || u_in->size == 8);
            /* fall through */

         case STORE:
         case FPU_W:
            t_write      = u_in->val2;
            t_write_addr = newTemp(cb);
            uInstr2(cb, MOV, 4, TempReg, u_in->val2, TempReg, t_write_addr);
            /* 28 and 108 B data-sized instructions will be done
             * inaccurately but they're very rare and this avoids errors
             * from hitting more than two cache lines in the simulation. */
            data_size = ( u_in->size <= MIN_LINE_SIZE
                        ? u_in->size
                        : MIN_LINE_SIZE);
            VG_(copy_UInstr)(cb, u_in);
            break;

      case SSE2a_MemWr:
            CT_ASSERT(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
	    /* fall through */
      case SSE3a_MemWr:
            CT_ASSERT(u_in->size == 4 || u_in->size == 8 || u_in->size == 16 || u_in->size == 512);
	    t_write = u_in->val3;
            t_write_addr = newTemp(cb);
            uInstr2(cb, MOV, 4, TempReg, u_in->val3, TempReg, t_write_addr);
            /* 512 B data-sized instructions will be done inaccurately
             * but they're very rare and this avoids errors from
             * hitting more than two cache lines in the simulation. */
            data_size = ( u_in->size <= MIN_LINE_SIZE
                        ? u_in->size
                        : MIN_LINE_SIZE);
            VG_(copy_UInstr)(cb, u_in);
            break;

         /* For rep-prefixed instructions, log a single I-cache access
          * before the UCode loop that implements the repeated part, which
          * is where the multiple D-cache accesses are logged. */
         case JIFZ:
            has_rep_prefix = True;

	    /* FIXME: If not simulating cache, we should count loops as
	     *        instructions */
	    if (clo_simulate_cache) {

		/* Setup 1st and only arg: CC addr */
		t_CC_addr = newTemp(cb);
		uInstr2(cb, MOV,  4, Literal, 0, TempReg, t_CC_addr);
		uLiteral(cb, (Addr)BBCC_ptr - (Addr)BBCC_node);

		/* Call helper */
		uInstr1(cb, CCALL, 0, TempReg, t_CC_addr);
		uCCall(cb, (Addr) & log_1I_0D_cache_access_JIFZ, 1, 1, False);
	    }
            VG_(copy_UInstr)(cb, u_in);
            break;


         /* INCEIP: insert instrumentation */
         case INCEIP:
            x86_instr_size = u_in->val1;
            goto instrument_x86_instr;

         /* JMP: insert instrumentation if the first JMP */
         case JMP:

            /* Second JMP of a "Jcond, Juncond" pair: the instruction was
             * already instrumented at the Jcond, so only copy it. */
            if (instrumented_Jcond) {

		if (clo_trace_jump) {
		    /* Overwrite conditional jmpkind, as not followed */
		    
		    Int tmp = newTemp(cb);
		    Int tmp_jk_addr = newTemp(cb);

		    uInstr2(cb, MOV,   4, Literal, 0, TempReg, tmp_jk_addr);
		    uLiteral(cb, (Addr) &bbcc_jmpkind);
		    uInstr2(cb, MOV,   4, Literal, 0, TempReg, tmp);
		    uLiteral(cb, JmpNone);
		    uInstr2(cb, STORE, 4, TempReg, tmp, TempReg, tmp_jk_addr);
		}

		CT_ASSERT(CondAlways == u_in->cond);
		CT_ASSERT(i+1 == VG_(get_num_instrs)(cb_in));
		VG_(copy_UInstr)(cb, u_in);
		instrumented_Jcond = False;    /* reset */
		break;
            }

            /* The first JMP... instrument. */

	    /* as preparation for the setup_bbcc call at the
	     * beginning of the next basic block, we store the
	     * jmpkind into a global variable if its a Call or Ret.
	     *
	     * if we instrument conditional jumps, use -2 (JmpCond) as jmpkind.
	     * Before the final (2nd) jump, jmpkind is reset to -1 (JmpNone).
	     * This way, jmpkind will only be -2 in setup_bbcc if the
	     * conditional jump was followed.
	     */
	    if ( ((CondAlways != u_in->cond) && clo_trace_jump) ||
		 ((CondAlways == u_in->cond) &&
		  ((u_in->jmpkind == JmpCall) ||
		   (u_in->jmpkind == JmpRet) ||
		   (clo_trace_jump && (u_in->jmpkind == JmpBoring)) )) ) {

		Int tmp = newTemp(cb);
		Int tmp_jk_addr = newTemp(cb);

		/* Address of bbcc_jmpkind into temp reg for STORE */
		uInstr2(cb, MOV,   4, Literal, 0, TempReg, tmp_jk_addr);
		uLiteral(cb, (Addr) &bbcc_jmpkind);
		uInstr2(cb, MOV,   4, Literal, 0, TempReg, tmp);
		uLiteral(cb, 
			 (CondAlways != u_in->cond) ? JmpCond : u_in->jmpkind);
		uInstr2(cb, STORE, 4, TempReg, tmp, TempReg, tmp_jk_addr);
	    }
	    
	    /* Record (first visit) or cross-check (later visits) the address
	     * of the x86 instruction that holds the block's first jump. */
	    if (BB_seen_before) {
		CT_ASSERT(BBCC_node->bb->jmp_addr == x86_instr_addr);
	    }
	    else {
		BBCC_node->bb->jmp_addr = x86_instr_addr;
	    }

            if (CondAlways != u_in->cond) {
               CT_ASSERT(i+2 == VG_(get_num_instrs)(cb_in));
               instrumented_Jcond = True;
            } else {
	       CT_ASSERT(i+1 == VG_(get_num_instrs)(cb_in));
	    }

            /* Get x86 instr size from final JMP. */
            x86_instr_size = VG_(get_last_instr)(cb_in)->extra4b;

            goto instrument_x86_instr;


            /* Code executed at the end of each x86 instruction. */
            instrument_x86_instr:

            /* Initialise the CC in the BBCC array appropriately if it
             * hasn't been initialised before.  Then call appropriate sim
             * function, passing it the CC address. */
            stack_used = 0;

            CT_ASSERT(x86_instr_size >= 1 && 
                      x86_instr_size <= MAX_x86_INSTR_SIZE);

/* True if a read/write address temp was set up for this x86 instruction. */
#define IS_(X)      (INVALID_TEMPREG != t_##X##_addr)

            if (!IS_(read) && !IS_(write)) {
               CT_ASSERT(INVALID_DATA_SIZE == data_size);
               CT_ASSERT(INVALID_TEMPREG == t_read_addr  && 
                         INVALID_TEMPREG == t_read       && 
                         INVALID_TEMPREG == t_write_addr &&
                         INVALID_TEMPREG == t_write);
               CC_size = sizeof(iCC);
               if (!BB_seen_before)
                   init_iCC((iCC*)BBCC_ptr, x86_instr_addr, x86_instr_size);
               helper = ( has_rep_prefix 
                        ? (Addr)0      /* no extra log needed */
                        : (Addr) & log_1I_0D_cache_access
                        );
               argc = 1;

            } else { 
               CT_ASSERT(4 == data_size || 2  == data_size || 1 == data_size || 
                         8 == data_size || 10 == data_size ||
                         MIN_LINE_SIZE == data_size);
               
               if (IS_(read) && !IS_(write)) {
                  CC_size = sizeof(idCC);
                  /* If it uses 'rep', we've already logged the I-cache 
                   * access at the JIFZ UInstr (see JIFZ case above) so
                   * don't do it here */
                  helper = ( has_rep_prefix 
                           ? (Addr) & log_0I_1D_cache_access
                           : (Addr) & log_1I_1D_cache_access
                           );
                  argc = 2;
                  if (!BB_seen_before)
                     init_idCC(ReadCC, (idCC*)BBCC_ptr, x86_instr_addr,
                               x86_instr_size, data_size);
                  CT_ASSERT(INVALID_TEMPREG != t_read_addr  && 
                            INVALID_TEMPREG != t_read       && 
                            INVALID_TEMPREG == t_write_addr &&
                            INVALID_TEMPREG == t_write);
                  t_data_addr1 = t_read_addr;

               } else if (!IS_(read) && IS_(write)) {
                  CC_size = sizeof(idCC);
                  helper = ( has_rep_prefix 
                           ? (Addr) & log_0I_1D_cache_access
                           : (Addr) & log_1I_1D_cache_access
                           );
                  argc = 2;
                  if (!BB_seen_before)
                     init_idCC(WriteCC, (idCC*)BBCC_ptr, x86_instr_addr,
                               x86_instr_size, data_size);
                  CT_ASSERT(INVALID_TEMPREG == t_read_addr  && 
                            INVALID_TEMPREG == t_read       && 
                            INVALID_TEMPREG != t_write_addr &&
                            INVALID_TEMPREG != t_write);
                  t_data_addr1 = t_write_addr;

               } else {
                  CT_ASSERT(IS_(read) && IS_(write));
                  CT_ASSERT(INVALID_TEMPREG != t_read_addr  && 
                            INVALID_TEMPREG != t_read       && 
                            INVALID_TEMPREG != t_write_addr &&
                            INVALID_TEMPREG != t_write);
                  /* Read and write through the same address temp: treat it
                   * as a single "modify" access (ModCC). */
                  if (t_read == t_write) {
                     CC_size = sizeof(idCC);
                     helper = ( has_rep_prefix 
                              ? (Addr) & log_0I_1D_cache_access
                              : (Addr) & log_1I_1D_cache_access
                              );
                     argc = 2;
                     if (!BB_seen_before)
                        init_idCC(ModCC, (idCC*)BBCC_ptr, x86_instr_addr,
                                  x86_instr_size, data_size);
                     t_data_addr1 = t_read_addr;
                  } else {
                     CC_size = sizeof(iddCC);
                     helper = ( has_rep_prefix 
                              ? (Addr) & log_0I_2D_cache_access
                              : (Addr) & log_1I_2D_cache_access
                              );
                     argc = 3;
                     if (!BB_seen_before)
                        init_iddCC((iddCC*)BBCC_ptr, x86_instr_addr,
                                    x86_instr_size, data_size);
                     t_data_addr1 = t_read_addr;
                     t_data_addr2 = t_write_addr;
                  }
               }
#undef IS_
            }

            /* Call the helper, if necessary */
            if (clo_simulate_cache && ((Addr)0 != helper)) {

               /* Setup 1st arg: CC addr offset */
               t_CC_addr = newTemp(cb);
               uInstr2(cb, MOV,   4, Literal, 0, TempReg, t_CC_addr);
               uLiteral(cb, (Addr)BBCC_ptr - (Addr)BBCC_node);

               /* Call the helper */
               if      (1 == argc)
                  uInstr1(cb, CCALL, 0, TempReg, t_CC_addr);
               else if (2 == argc)
                  uInstr2(cb, CCALL, 0, TempReg, t_CC_addr, 
                                        TempReg, t_data_addr1);
               else if (3 == argc)
                  uInstr3(cb, CCALL, 0, TempReg, t_CC_addr, 
                                        TempReg, t_data_addr1,
                                        TempReg, t_data_addr2);
               else
                  VG_(skin_panic)("argc... not 1 or 2 or 3?");
               
               uCCall(cb, helper, argc, argc, False);
	    }

	    /* Count number of instrumented instructions */
	    if (!BB_seen_before) distinct_instrs++;

            /* Copy original UInstr (INCEIP or JMP) */
            VG_(copy_UInstr)(cb, u_in);

	    instr_count++;

            /* Update BBCC_ptr, EIP, de-init read/write temps for next instr */
            if (BBCC_ptr) BBCC_ptr += CC_size; 
            x86_instr_addr += x86_instr_size;
            t_CC_addr = t_read_addr = t_write_addr = t_data_addr1 = 
                        t_data_addr2 = t_read = t_write = INVALID_TEMPREG;
            data_size = INVALID_DATA_SIZE;
            has_rep_prefix = False; 
            break;

         default:
            VG_(copy_UInstr)(cb, u_in);
            break;
      }
   }

   /* Just check everything looks ok */
   CT_ASSERT(BBCC_ptr - BBCC_ptr0 == BBCC_node->bb->array_size);

   VG_(free_UCodeBlock)(cb_in);

   /* x86_instr_addr now points just past the last instruction of the BB. */
   BBCC_node->bb->size = x86_instr_addr - orig_addr;
   BBCC_node->bb->instr_count = instr_count;

   CT_DEBUG(3, "- instrument(BB 0x%x)\n", orig_addr);

   return cb;

#undef INVALID_DATA_SIZE
}

/*------------------------------------------------------------*/
/*--- Automagic cache initialisation stuff                 ---*/
/*------------------------------------------------------------*/

/* Sentinel cache description: all three fields are -1. */
#define UNDEFINED_CACHE     ((cache_t) { -1, -1, -1 }) 

/* Cache descriptions requested on the command line.  get_caches() treats a
 * cache as "given" as soon as any of its fields differs from -1. */
static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;

/* All CPUID info taken from sandpile.org/a32/cpuid.htm */
/* Probably only works for Intel and AMD chips, and probably only for some of
 * them. 
 */

/* Execute the CPUID instruction with n loaded into %eax and store the
 * resulting %eax, %ebx, %ecx and %edx through a, b, c and d respectively. */
static __inline__ void cpuid(Int n, UInt *a, UInt *b, UInt *c, UInt *d)
{
   __asm__ __volatile__ (
    "cpuid"
    : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)      /* output */
    : "0" (n)         /* input */   /* "0": same register as output 0 (%eax) */
    );
}

/* Emit a two-line debug warning saying that the detected instruction cache
 * is a micro-op trace cache of actual_size K micro-ops, and that a
 * byte-based cache of used_size KB with line_size B lines is simulated
 * in its place. */
static void micro_ops_warn(Int actual_size, Int used_size, Int line_size)
{
    /* What the hardware reports ... */
    VG_(message)(Vg_DebugMsg,
                 "warning: Pentium with %d K micro-op instruction trace cache",
                 actual_size);

    /* ... and what gets simulated instead. */
    VG_(message)(Vg_DebugMsg,
                 "         Simulating a %d KB cache with %d B lines",
                 used_size, line_size);
}

/* Intel method is truly wretched.  We have to do an insane indexing into an
 * array of pre-defined configurations for various parts of the memory
 * hierarchy. 
 *
 * CPUID leaf 2 returns up to 16 one-byte descriptors packed into
 * %eax..%edx; the low byte of %eax is instead the number of times the leaf
 * must be queried.  Each recognised descriptor overwrites *I1c, *D1c or
 * *L2c with a { size in KB, associativity, line size in bytes } triple;
 * caches for which no descriptor is found are left untouched.
 *
 * level is the maximum basic CPUID leaf supported by the processor.
 *
 * Returns 0 on success, -1 if leaf 2 is unavailable or would have to be
 * queried more than once.
 */
static
Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
   /* cpuid() stores through UInt pointers, so receive the registers as
    * properly typed and aligned UInts and view them as bytes through the
    * union (rather than casting a byte array to an int pointer). */
   union {
      UInt  reg[4];      /* %eax, %ebx, %ecx, %edx */
      UChar byte[16];    /* the same data, one descriptor per byte */
   } info;
   Int   i, trials;
   Bool  L2_found = False;

   if (level < 2) {
      VG_(message)(Vg_DebugMsg, 
         "warning: CPUID level < 2 for Intel processor (%d)", 
         level);
      return -1;
   }

   cpuid(2, &info.reg[0], &info.reg[1], &info.reg[2], &info.reg[3]);
   trials       = info.byte[0] - 1;   /* AL register - bits 0..7 of %eax */
   info.byte[0] = 0x0;                /* reset AL */

   if (0 != trials) {
      VG_(message)(Vg_DebugMsg, 
         "warning: non-zero CPUID trials for Intel processor (%d)",
         trials);
      return -1;
   }

   for (i = 0; i < 16; i++) {

      switch (info.byte[i]) {

      case 0x0:       /* ignore zeros */
          break;
          
      /* TLB info, ignore */
      case 0x01: case 0x02: case 0x03: case 0x04:
      case 0x50: case 0x51: case 0x52: case 0x5b: case 0x5c: case 0x5d:
      case 0xb0: case 0xb3:
          break;      

      case 0x06: *I1c = (cache_t) {  8, 4, 32 }; break;
      case 0x08: *I1c = (cache_t) { 16, 4, 32 }; break;
      case 0x30: *I1c = (cache_t) { 32, 8, 64 }; break;

      case 0x0a: *D1c = (cache_t) {  8, 2, 32 }; break;
      case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break;
      case 0x2c: *D1c = (cache_t) {  32, 8, 64 }; break;

      /* IA-64 info -- panic! */
      case 0x10: case 0x15: case 0x1a: 
      case 0x88: case 0x89: case 0x8a: case 0x8d:
      case 0x90: case 0x96: case 0x9b:
         VG_(message)(Vg_DebugMsg,
            "error: IA-64 cache stats!  Cachegrind doesn't run on IA-64...");
         VG_(skin_panic)("IA-64 detected");
         break;   /* not expected to be reached; avoids silent fall-through */

      case 0x22: case 0x23: case 0x25: case 0x29: 
          VG_(message)(Vg_DebugMsg, 
             "warning: L3 cache detected but ignored\n");
          break;

      /* These are sectored, whatever that means */
      case 0x39: *L2c = (cache_t) {  128, 4, 64 }; L2_found = True; break;
      case 0x3c: *L2c = (cache_t) {  256, 4, 64 }; L2_found = True; break;

      /* If a P6 core, this means "no L2 cache".  
         If a P4 core, this means "no L3 cache".
         We don't know what core it is, so don't issue a warning.  To detect
         a missing L2 cache, we use 'L2_found'. */
      case 0x40:
          break;

      case 0x41: *L2c = (cache_t) {  128, 4, 32 }; L2_found = True; break;
      case 0x42: *L2c = (cache_t) {  256, 4, 32 }; L2_found = True; break;
      case 0x43: *L2c = (cache_t) {  512, 4, 32 }; L2_found = True; break;
      case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
      case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break;

      /* These are sectored, whatever that means */
      case 0x66: *D1c = (cache_t) {  8, 4, 64 };  break;      /* sectored */
      case 0x67: *D1c = (cache_t) { 16, 4, 64 };  break;      /* sectored */
      case 0x68: *D1c = (cache_t) { 32, 4, 64 };  break;      /* sectored */

      /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based.
       * conversion to byte size is a total guess;  treat the 12K and 16K
       * cases the same since the cache byte size must be a power of two for
       * everything to work!.  Also guessing 32 bytes for the line size... 
       */
      case 0x70:    /* 12K micro-ops, 8-way */
         *I1c = (cache_t) { 16, 8, 32 };  
         micro_ops_warn(12, 16, 32);
         break;  
      case 0x71:    /* 16K micro-ops, 8-way */
         *I1c = (cache_t) { 16, 8, 32 };  
         micro_ops_warn(16, 16, 32); 
         break;  
      case 0x72:    /* 32K micro-ops, 8-way */
         *I1c = (cache_t) { 32, 8, 32 };  
         micro_ops_warn(32, 32, 32); 
         break;  

      /* These are sectored, whatever that means */
      case 0x79: *L2c = (cache_t) {  128, 8,  64 }; L2_found = True;  break;
      case 0x7a: *L2c = (cache_t) {  256, 8,  64 }; L2_found = True;  break;
      case 0x7b: *L2c = (cache_t) {  512, 8,  64 }; L2_found = True;  break;
      case 0x7c: *L2c = (cache_t) { 1024, 8,  64 }; L2_found = True;  break;
      case 0x7e: *L2c = (cache_t) {  256, 8, 128 }; L2_found = True;  break;

      case 0x81: *L2c = (cache_t) {  128, 8, 32 };  L2_found = True;  break;
      case 0x82: *L2c = (cache_t) {  256, 8, 32 };  L2_found = True;  break;
      case 0x83: *L2c = (cache_t) {  512, 8, 32 };  L2_found = True;  break;
      case 0x84: *L2c = (cache_t) { 1024, 8, 32 };  L2_found = True;  break;
      case 0x85: *L2c = (cache_t) { 2048, 8, 32 };  L2_found = True;  break;
      case 0x86: *L2c = (cache_t) {  512, 4, 64 };  L2_found = True;  break;
      case 0x87: *L2c = (cache_t) { 1024, 8, 64 };  L2_found = True;  break;

      default:
          VG_(message)(Vg_DebugMsg, 
             "warning: Unknown Intel cache config value "
             "(0x%x), ignoring", info.byte[i]);
          break;
      }
   }

   if (!L2_found)
      VG_(message)(Vg_DebugMsg, 
         "warning: L2 cache not installed, ignore L2 results.");

   return 0;
}

/* AMD method is straightforward, just extract appropriate bits from the
 * result registers.
 *
 * Bits, for D1 and I1:
 *  31..24  data L1 cache size in KBs    
 *  23..16  data L1 cache associativity (FFh=full)    
 *  15.. 8  data L1 cache lines per tag    
 *   7.. 0  data L1 cache line size in bytes
 *
 * Bits, for L2:
 *  31..16  unified L2 cache size in KBs
 *  15..12  unified L2 cache associativity (0=off, FFh=full)
 *  11.. 8  unified L2 cache lines per tag    
 *   7.. 0  unified L2 cache line size in bytes
 *
 * #3  The AMD K7 processor's L2 cache must be configured prior to relying 
 *     upon this information. (Whatever that means -- njn)
 *
 * Also, according to Cyrille Chepelov, Duron stepping A0 processors (model
 * 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB),
 * so we detect that.
 * 
 * Returns 0 on success, non-zero on failure.
 */
static
Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
   UInt ext_level;
   Int dummy, model;
   Int I1i, D1i, L2i;
   
   cpuid(0x80000000, &ext_level, &dummy, &dummy, &dummy);

   if (0 == (ext_level & 0x80000000) || ext_level < 0x80000006) {
      VG_(message)(Vg_UserMsg, 
         "warning: ext_level < 0x80000006 for AMD processor (0x%x)", 
         ext_level);
      return -1;
   }

   cpuid(0x80000005, &dummy, &dummy, &D1i, &I1i);
   cpuid(0x80000006, &dummy, &dummy, &L2i, &dummy);

   cpuid(0x1, &model, &dummy, &dummy, &dummy);
   /*VG_(message)(Vg_UserMsg,"CPU model %04x",model);*/

   /* Check for Duron bug */
   if (model == 0x630) {
      VG_(message)(Vg_UserMsg,
         "Buggy Duron stepping A0. Assuming L2 size=65536 bytes");
      L2i = (64 << 16) | (L2i & 0xffff);
   }

   D1c->size      = (D1i >> 24) & 0xff;
   D1c->assoc     = (D1i >> 16) & 0xff;
   D1c->line_size = (D1i >>  0) & 0xff;

   I1c->size      = (I1i >> 24) & 0xff;
   I1c->assoc     = (I1i >> 16) & 0xff;
   I1c->line_size = (I1i >>  0) & 0xff;

   L2c->size      = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
   L2c->assoc     = (L2i >> 12) & 0xf;
   L2c->line_size = (L2i >>  0) & 0xff;

   return 0;
}

/* Jump buffer set up by get_caches_from_CPUID() (via __builtin_setjmp)
 * just before it tries the CPUID instruction. */
static jmp_buf cpuid_jmpbuf;

/* SIGILL handler installed while probing for CPUID: jumps back to the
 * __builtin_setjmp point with value 1.  signum is not used. */
static
void cpuid_SIGILL_handler(int signum)
{
   __builtin_longjmp(cpuid_jmpbuf, 1);
}

/* Try to determine the I1, D1 and L2 cache configuration of the host CPU
 * with the CPUID instruction.
 *
 * A temporary SIGILL handler is installed around the first CPUID so that a
 * processor without the instruction is detected instead of killing us; the
 * previous handler is restored on both paths.
 *
 * On success (return value 0) the cache descriptions hold sizes in bytes;
 * a cache the vendor-specific code could not identify keeps whatever value
 * the caller passed in, scaled by 1024.  On failure (non-zero return) the
 * descriptions are left exactly as the vendor-specific code left them and
 * must not be relied upon.
 */
static 
Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
   Int  res, ret;
   UInt level;
   /* The 12-byte vendor string arrives as three registers; receive them as
    * properly typed UInts and read them back as characters. */
   union {
      UInt reg[3];
      Char str[13];
   } vendor;
   vki_ksigaction sigill_new, sigill_saved;

   /* Install own SIGILL handler */
   sigill_new.ksa_handler  = cpuid_SIGILL_handler;
   sigill_new.ksa_flags    = 0;
   sigill_new.ksa_restorer = NULL;
   res = VG_(ksigemptyset)( &sigill_new.ksa_mask );
   CT_ASSERT(res == 0);

   res = VG_(ksigaction)( VKI_SIGILL, &sigill_new, &sigill_saved );
   CT_ASSERT(res == 0);

   /* Trap for illegal instruction, in case it's a really old processor that
    * doesn't support CPUID. */
   if (__builtin_setjmp(cpuid_jmpbuf) == 0) {
      /* Vendor string order is %ebx, %edx, %ecx. */
      cpuid(0, &level, &vendor.reg[0], &vendor.reg[2], &vendor.reg[1]);
      vendor.str[12] = '\0';

      /* Restore old SIGILL handler */
      res = VG_(ksigaction)( VKI_SIGILL, &sigill_saved, NULL );
      CT_ASSERT(res == 0);

   } else  {
      VG_(message)(Vg_DebugMsg, "CPUID instruction not supported");

      /* Restore old SIGILL handler */
      res = VG_(ksigaction)( VKI_SIGILL, &sigill_saved, NULL );
      CT_ASSERT(res == 0);
      return -1;
   }

   if (0 == level) {
      VG_(message)(Vg_DebugMsg, "CPUID level is 0, early Pentium?\n");
      return -1;
   }

   /* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */
   if (0 == VG_(strcmp)(vendor.str, "GenuineIntel")) {
      ret = Intel_cache_info((Int)level, I1c, D1c, L2c);

   } else if (0 == VG_(strcmp)(vendor.str, "AuthenticAMD")) {
      ret = AMD_cache_info(I1c, D1c, L2c);

   } else if (0 == VG_(strcmp)(vendor.str, "CentaurHauls")) {
      /* Total kludge.  Pretend to be a VIA Nehemiah. */
      D1c->size      = 64;
      D1c->assoc     = 16;
      D1c->line_size = 16;
      I1c->size      = 64;
      I1c->assoc     = 4;
      I1c->line_size = 16;
      L2c->size      = 64;
      L2c->assoc     = 16;
      L2c->line_size = 16;
      ret = 0;

   } else {
      VG_(message)(Vg_DebugMsg, "CPU vendor ID not recognised (%s)",
                   vendor.str);
      return -1;
   }

   /* The vendor-specific query failed: nothing meaningful to scale. */
   if (0 != ret)
      return ret;

   /* Successful!  Convert sizes from KB to bytes */
   I1c->size *= 1024;
   D1c->size *= 1024;
   L2c->size *= 1024;
      
   return ret;
}

/* Validate one cache description and repair it in place.
 *
 * cache - description to check (modified when a problem is found)
 * dflt  - description supplying the fallback for non-power-of-two fields
 * name  - label used in the warnings ("I1", "D1", "L2")
 *
 * Every repair is reported with a user-level warning. */
static 
void check_cache(cache_t* cache, cache_t* dflt, Char *name)
{
   /* Step 1: size, associativity and line size must each be a power of
    * two; a field that is not gets the corresponding default. */
   if (VG_(log2)(cache->size) == -1) {
      VG_(message)(Vg_UserMsg,
         "warning: %s size of %dB not a power of two; "
         "defaulting to %dB", name, cache->size, dflt->size);
      cache->size = dflt->size;
   }

   if (VG_(log2)(cache->assoc) == -1) {
      VG_(message)(Vg_UserMsg,
         "warning: %s associativity of %d not a power of two; "
         "defaulting to %d-way", name, cache->assoc, dflt->assoc);
      cache->assoc = dflt->assoc;
   }

   if (VG_(log2)(cache->line_size) == -1) {
      VG_(message)(Vg_UserMsg,
         "warning: %s line size of %dB not a power of two; "
         "defaulting to %dB", 
         name, cache->line_size, dflt->line_size);
      cache->line_size = dflt->line_size;
   }

   /* Step 2: lines shorter than MIN_LINE_SIZE would let one instruction
    * straddle three cache lines, which the simulation asserts against. */
   if (MIN_LINE_SIZE > cache->line_size) {
      VG_(message)(Vg_UserMsg,
         "warning: %s line size of %dB too small; "
         "increasing to %dB", name, cache->line_size, MIN_LINE_SIZE);
      cache->line_size = MIN_LINE_SIZE;
   }

   /* Step 3: the cache must hold more than a single line (seg faults
    * otherwise); grow it to two lines if it does not. */
   if (cache->line_size >= cache->size) {
      Int two_lines = cache->line_size * 2;

      VG_(message)(Vg_UserMsg,
         "warning: %s cache size of %dB <= line size of %dB; "
         "increasing to %dB", name, cache->size, cache->line_size,
                              two_lines);
      cache->size = two_lines;
   }

   /* Step 4: there must be at least one set, i.e. assoc may not exceed the
    * number of lines (seg faults otherwise); grow the cache to fit. */
   if ((cache->size / cache->line_size) < cache->assoc) {
      Int one_set = cache->assoc * cache->line_size;

      VG_(message)(Vg_UserMsg,
         "warning: %s associativity > (size / line size); "
         "increasing size to %dB", 
            name, one_set);
      cache->size = one_set;
   }
}

/* Work out the cache configuration to simulate.
 *
 * The incoming contents of *I1c, *D1c and *L2c are ignored.  Each cache is
 * taken from the command line if given there; otherwise from CPUID if that
 * works; otherwise from built-in defaults.  The result is then run through
 * check_cache() and, at verbosity > 1, printed. */
static 
void get_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
   /* Defaults are for a model 3 or 4 Athlon */
   cache_t I1_dflt = (cache_t) {  65536, 2, 64 };
   cache_t D1_dflt = (cache_t) {  65536, 2, 64 };
   cache_t L2_dflt = (cache_t) { 262144, 8, 64 };

/* A cache counts as given on the command line once any field was set. */
#define GIVEN_ON_CMD_LINE(L)           \
   (-1 != clo_##L##_cache.size  ||     \
    -1 != clo_##L##_cache.assoc ||     \
    -1 != clo_##L##_cache.line_size)

   /* Start out from whatever the command line provided. */
   *I1c = clo_I1_cache;
   *D1c = clo_D1_cache;
   *L2c = clo_L2_cache;

   /* CPUID is only consulted when at least one cache is still missing. */
   if (!(GIVEN_ON_CMD_LINE(I1) &&
         GIVEN_ON_CMD_LINE(D1) &&
         GIVEN_ON_CMD_LINE(L2))) {

      if (0 != get_caches_from_CPUID(I1c, D1c, L2c)) {
         /* Detection failed: command-line value where there is one,
          * built-in default everywhere else. */
         VG_(message)(Vg_DebugMsg, 
                      "Couldn't detect cache configuration, using one "
                      "or more defaults ");

         if (GIVEN_ON_CMD_LINE(I1)) *I1c = clo_I1_cache;
         else                       *I1c = I1_dflt;

         if (GIVEN_ON_CMD_LINE(D1)) *D1c = clo_D1_cache;
         else                       *D1c = D1_dflt;

         if (GIVEN_ON_CMD_LINE(L2)) *L2c = clo_L2_cache;
         else                       *L2c = L2_dflt;

      } else {
         /* Detection worked, but command-line values take precedence
          * over what CPUID reported. */
         if (GIVEN_ON_CMD_LINE(I1)) *I1c = clo_I1_cache;
         if (GIVEN_ON_CMD_LINE(D1)) *D1c = clo_D1_cache;
         if (GIVEN_ON_CMD_LINE(L2)) *L2c = clo_L2_cache;
      }
   }
#undef GIVEN_ON_CMD_LINE

   check_cache(I1c, &I1_dflt, "I1");
   check_cache(D1c, &D1_dflt, "D1");
   check_cache(L2c, &L2_dflt, "L2");

   if (VG_(clo_verbosity) > 1) {
      VG_(message)(Vg_UserMsg, "Cache configuration used:");
      VG_(message)(Vg_UserMsg, "  I1: %dB, %d-way, %dB lines",
                               I1c->size, I1c->assoc, I1c->line_size);
      VG_(message)(Vg_UserMsg, "  D1: %dB, %d-way, %dB lines",
                               D1c->size, D1c->assoc, D1c->line_size);
      VG_(message)(Vg_UserMsg, "  L2: %dB, %d-way, %dB lines",
                               L2c->size, L2c->assoc, L2c->line_size);
   }
}



/*------------------------------------------------------------*/
/*--- Output file related stuff                            ---*/
/*------------------------------------------------------------*/

/* Boolean dumping array.
 * One flat allocation (dump_array, dump_array_size entries) that
 * init_dump_array() carves into four consecutive sections: objects,
 * files, functions and contexts.  The print_* helpers use the flags to
 * remember which names were already written out in full when string
 * compression is enabled. */
static Bool* dump_array = 0;
static Int   dump_array_size = 0;
static Bool* obj_dumped = 0;
static Bool* file_dumped = 0;
static Bool* fn_dumped = 0;
static Bool* cxt_dumped = 0;

/* Mark every entry of the dump array as "not yet dumped".
 * The array must have been allocated before. */
static void reset_dump_array()
{
    int idx = 0;

    CT_ASSERT(dump_array != 0);

    while (idx < dump_array_size) {
	dump_array[idx] = False;
	idx++;
    }
}

/* Allocate the dump array and set up the per-kind section pointers.
 * Layout: [objects | files | functions | contexts].  All flags start
 * out cleared.  Must not be called while an array is still allocated. */
static void init_dump_array()
{
    CT_ASSERT(dump_array == 0);

    dump_array_size = distinct_objs + distinct_files +
		      distinct_fns + context_counter;
    dump_array = VG_(malloc)(dump_array_size * sizeof(Bool));

    /* each section starts where the previous one ends */
    obj_dumped  = dump_array;
    file_dumped = dump_array + distinct_objs;
    fn_dumped   = dump_array + distinct_objs + distinct_files;
    cxt_dumped  = dump_array + distinct_objs + distinct_files + distinct_fns;

    reset_dump_array();
}

/* Release the dump array and invalidate all pointers into it. */
static __inline__
void free_dump_array()
{
    CT_ASSERT(dump_array != 0);
    VG_(free)(dump_array);

    /* the section pointers alias the freed block, so clear them as well */
    cxt_dumped  = 0;
    fn_dumped   = 0;
    file_dumped = 0;
    obj_dumped  = 0;
    dump_array  = 0;
}


/*
 * Structure SPos is a pointer to a source position.
 */

/* A function in an execution context.
 * print_fn_pos() keeps one of these as the "last printed" position and
 * only writes out the parts that differ for the next BBCC. */
typedef struct _FPos FPos;
struct _FPos {
    file_node* file;  /* source file of the function */
    fn_node* fn;      /* the function itself */
    obj_node* obj;    /* object the file belongs to */
    Context* cxt;     /* execution context */
    int rec_index;    /* recursion index within the context */
    UInt line;        /* NOTE(review): not referenced in this part of the file */
};

/* Initialize <p> to an invalid position: all pointers null, recursion
 * index and line zero.  Every field is set, so that no member of a
 * freshly initialized position is ever read uninitialized. */
static __inline__
void init_fpos(FPos* p)
 {
    p->file = 0;
    p->fn = 0;
    p->obj = 0;
    p->cxt = 0;
    p->rec_index = 0;
    p->line = 0;
}


#if 0
/* Disabled (#if 0) unbuffered variant: hands every request straight
 * to VG_(write). */
static __inline__
static void fwrite(Int fd, Char* buf, Int len)
{
	VG_(write)(fd, (void*)buf, len);
}
#else

/* State of the write buffer used by fwrite()/fwrite_flush().
 * FWRITE_BUFSIZE is the capacity of fwrite_buf; requests longer than
 * FWRITE_THROUGH bytes bypass the buffer and are written directly. */
#define FWRITE_BUFSIZE 32000
#define FWRITE_THROUGH 10000
static Char fwrite_buf[FWRITE_BUFSIZE];
static Int fwrite_pos;       /* number of bytes currently buffered */
static Int fwrite_fd = -1;   /* descriptor the buffered bytes belong to */

/* Write any buffered bytes to the descriptor they were collected for
 * and mark the buffer as empty. */
static __inline__
void fwrite_flush()
{
    if (fwrite_fd > 0) {
	if (fwrite_pos > 0)
	    VG_(write)(fwrite_fd, (void*)fwrite_buf, fwrite_pos);
    }
    fwrite_pos = 0;
}

/* Buffered write of <len> bytes from <buf> to <fd>.
 * Switching to another descriptor flushes what was collected for the
 * previous one.  Requests longer than FWRITE_THROUGH are written
 * directly (after a flush, to keep the output order). */
static void fwrite(Int fd, Char* buf, Int len)
{
    if (fd != fwrite_fd) {
	fwrite_flush();
	fwrite_fd = fd;
    }

    if (len <= FWRITE_THROUGH) {
	/* make room first if the request does not fit any more */
	if (len >= FWRITE_BUFSIZE - fwrite_pos)
	    fwrite_flush();

	VG_(strncpy)(&fwrite_buf[fwrite_pos], buf, len);
	fwrite_pos += len;
    }
    else {
	fwrite_flush();
	VG_(write)(fd, (void*)buf, len);
    }
}
#endif


/* Format the name of object <obj> as one line into <buf>.
 * With string compression the line is "(<number>) <name>" the first
 * time the object is printed and "(<number>)" afterwards; without
 * compression it is just the name.
 * The object is marked as dumped in either case.
 *
 * The former set-but-unused character count and the disabled draft for
 * writing mapping parameters (which referenced a nonexistent variable)
 * were removed. */
static void print_obj(Char* buf, obj_node* obj)
{
    if (clo_compress_strings) {
	CT_ASSERT(obj_dumped != 0);
	if (obj_dumped[obj->number])
	    VG_(sprintf)(buf, "(%d)\n", obj->number);
	else {
	    VG_(sprintf)(buf, "(%d) %s\n",
			 obj->number, obj->name);
	}
    }
    else
	VG_(sprintf)(buf, "%s\n", obj->name);

    /* NOTE(review): unlike print_file(), the flag is set even when
     * compression is off, so obj_dumped has to be allocated on that
     * path too -- confirm against the callers. */
    obj_dumped[obj->number] = True;
}

/* Format the name of source file <file> as one line into <buf>.
 * With string compression the line is "(<number>) <name>" the first
 * time the file is printed (which also marks it as dumped) and
 * "(<number>)" afterwards; without compression it is just the name. */
static void print_file(Char* buf, file_node* file)
{
    if (!clo_compress_strings) {
	VG_(sprintf)(buf, "%s\n", file->name);
	return;
    }

    CT_ASSERT(file_dumped != 0);

    if (!file_dumped[file->number]) {
	VG_(sprintf)(buf, "(%d) %s\n",
		     file->number, file->name);
	file_dumped[file->number] = True;
	return;
    }

    VG_(sprintf)(buf, "(%d)\n", file->number);
}

/*
 * Write the line "<tag>=<function>" for <fn> to <fd>, using <buf> as
 * scratch space.  tag can be "fn", "cfn", "jfn".
 * With string compression the function is written as "(<number>) <name>"
 * the first time (and marked as dumped) and as "(<number>)" afterwards.
 */
static void print_fn(Int fd, Char* buf, Char* tag, fn_node* fn)
{
    int len = VG_(sprintf)(buf, "%s=",tag);

    if (!clo_compress_strings) {
	len += VG_(sprintf)(buf+len, "%s\n", fn->name);
    }
    else {
	CT_ASSERT(fn_dumped != 0);
	if (!fn_dumped[fn->number]) {
	    len += VG_(sprintf)(buf+len, "(%d) %s\n",
				fn->number, fn->name);
	    fn_dumped[fn->number] = True;
	}
	else
	    len += VG_(sprintf)(buf+len, "(%d)\n", fn->number);
    }

    fwrite(fd, buf, len);
}

/*
 * Write the context-mangled name for (cxt, rec_index) as a "<tag>=..."
 * line to <fd>; <buf> is scratch space for formatting.
 *
 * The name is identified by the number cxt->base_number + rec_index.
 * - clo_compress_strings && clo_compress_mangled:
 *   an already dumped name is written as "(<number>)" only.  Otherwise
 *   every function of the context whose pure context was not dumped yet
 *   gets its own "<tag>=(<n>) <name>" line first, and the mangled name is
 *   then composed from the pure context numbers:
 *   "(<number>) (<n0>)['<rec_index+1>]'(<n1>)'(<n2>)...".
 * - otherwise the name is composed from the function names:
 *   "<fn0>['<rec_index+1>]'<fn1>'<fn2>...", with clo_compress_strings
 *   prefixed by "(<number>) " on first use and replaced by "(<number>)"
 *   afterwards.
 */
static void print_mangled_fn(Int fd, Char* buf, Char* tag, 
			     Context* cxt, int rec_index)
{
    int p, i;

    if (clo_compress_strings && clo_compress_mangled) {

	int n;
	Context* last;

	CT_ASSERT(cxt_dumped != 0);
	/* already dumped: the number alone is enough */
	if (cxt_dumped[cxt->base_number+rec_index]) {
	    p = VG_(sprintf)(buf, "%s=(%d)\n",
			     tag, cxt->base_number + rec_index);
	    fwrite(fd, buf, p);
	    return;
	}

	last = 0;
	/* make sure that for all context parts compressed data is written */
	for(i=cxt->size;i>0;i--) {
	    CT_ASSERT(cxt->fn[i-1]->pure_cxt != 0);
	    n = cxt->fn[i-1]->pure_cxt->base_number;
	    if (cxt_dumped[n]) continue;
	    p = VG_(sprintf)(buf, "%s=(%d) %s\n",
			     tag, n, cxt->fn[i-1]->name);
	    fwrite(fd, buf, p);

	    cxt_dumped[n] = True;
	    /* remember the pure context written most recently */
	    last = cxt->fn[i-1]->pure_cxt;
	}
	/* If the last context was the context to print, we are finished */
	if ((last == cxt) && (rec_index == 0)) return;

	/* compose the mangled name from the numbers of the pure contexts */
	p = VG_(sprintf)(buf, "%s=(%d) (%d)", tag,
			 cxt->base_number + rec_index,
			 cxt->fn[0]->pure_cxt->base_number);
	if (rec_index >0)
	    p += VG_(sprintf)(buf+p, "'%d", rec_index +1);
	for(i=1;i<cxt->size;i++)
	    p += VG_(sprintf)(buf+p, "'(%d)", 
			      cxt->fn[i]->pure_cxt->base_number);
	p += VG_(sprintf)(buf+p, "\n");
	fwrite(fd, buf, p);

	cxt_dumped[cxt->base_number+rec_index] = True;
	return;
    }


    /* mangled name built from the function names themselves */
    p = VG_(sprintf)(buf, "%s=", tag);
    if (clo_compress_strings) {
	CT_ASSERT(cxt_dumped != 0);
	if (cxt_dumped[cxt->base_number+rec_index]) {
	    p += VG_(sprintf)(buf+p, "(%d)\n", cxt->base_number + rec_index);
	    fwrite(fd, buf, p);
	    return;
	}
	else {
	    /* first use: the number is followed by the full name below */
	    p += VG_(sprintf)(buf+p, "(%d) ", cxt->base_number + rec_index);
	    cxt_dumped[cxt->base_number+rec_index] = True;
	}
    }

    p += VG_(sprintf)(buf+p, "%s", cxt->fn[0]->name);
    if (rec_index >0)
	p += VG_(sprintf)(buf+p, "'%d", rec_index +1);
    for(i=1;i<cxt->size;i++)
	p += VG_(sprintf)(buf+p, "'%s", cxt->fn[i]->name);

    p += VG_(sprintf)(buf+p, "\n");
    fwrite(fd, buf, p);
}



/**
 * Print function position of the BBCC, but only print info differing to
 * the <last> position, update <last>
 * Return True if something changes.
 *
 * Written, as needed and in this order: "rec=" and "frfn=" lines (only
 * without name mangling), "ob=", "fl=" and finally "fn=" (plain function
 * name, or mangled context name with clo_mangle_names).
 */
static Bool print_fn_pos(int fd, FPos* last, BBCC* bbcc)
{
    Bool res = False;

    CT_DEBUGIF(3) {
	CT_DEBUG(2, "+ print_fn_pos: ");
	print_cxt(16, bbcc->cxt, bbcc->rec_index);
    }

    if (!clo_mangle_names) {
	/* without mangling, recursion level and caller get lines of their own */
	if (last->rec_index != bbcc->rec_index) {
	    VG_(sprintf)(outbuf, "rec=%d\n\n", bbcc->rec_index);
	    fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
	    last->rec_index = bbcc->rec_index;
	    last->cxt = 0; /* reprint context */
	    res = True;
	}
	
	if (last->cxt != bbcc->cxt) {
	    /* fn[1] of a context with more than one entry is printed as "frfn" */
	    fn_node* last_from = (last->cxt && last->cxt->size>1) ?
				 last->cxt->fn[1] : 0;
	    fn_node* curr_from = (bbcc->cxt && bbcc->cxt->size>1) ?
				 bbcc->cxt->fn[1] : 0;
	    if (curr_from == 0) {
		if (last_from != 0) {
		    /* switch back to no context */
		    VG_(sprintf)(outbuf, "frfn=(spontaneous)\n");
		    fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
		    res = True;
		}
	    }
	    else if (last_from != curr_from) {
		print_fn(fd,outbuf,"frfn", curr_from);
		res = True;
	    }
	    last->cxt = bbcc->cxt;
	}
    }

    /* object changed? */
    if (last->obj != bbcc->cxt->fn[0]->file->obj) {
	VG_(sprintf)(outbuf, "ob=");
	print_obj(outbuf+3, bbcc->cxt->fn[0]->file->obj);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
	last->obj = bbcc->cxt->fn[0]->file->obj;
	res = True;
    }

    /* source file changed? */
    if (last->file != bbcc->cxt->fn[0]->file) {
	VG_(sprintf)(outbuf, "fl=");
	print_file(outbuf+3, bbcc->cxt->fn[0]->file);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
	last->file = bbcc->cxt->fn[0]->file;
	res = True;
    }

    if (!clo_mangle_names) {
	if (last->fn != bbcc->cxt->fn[0]) {
	    print_fn(fd,outbuf, "fn", bbcc->cxt->fn[0]);
	    last->fn = bbcc->cxt->fn[0];
	    res = True;
	}
    }
    else {
	/* Print mangled name if context or rec_index changes */
	if ((last->rec_index != bbcc->rec_index) ||
	    (last->cxt != bbcc->cxt)) {

	    print_mangled_fn(fd, outbuf, "fn", bbcc->cxt, bbcc->rec_index);
	    last->fn = bbcc->cxt->fn[0];
	    last->rec_index = bbcc->rec_index;
	    res = True;
	}
    }

    /* in the mangled case this is the only place <last>'s context is updated */
    last->cxt = bbcc->cxt;

    CT_DEBUG(2, "- print_fn_pos: %s\n", res ? "changed" : "");
    
    return res;
}

/* Address position inside of a BBCC:
 * This includes
 * - the address offset from the BB start address
 * - file/line from debug info for that address (can change inside a BB)
 */
typedef struct _APos APos;
struct _APos {
    Addr addr;        /* instruction address; get_debug_pos() stores it
                       * with the object's offset subtracted */
    Addr bb_addr;     /* start address of the basic block, same adjustment */
    file_node* file;  /* source file for addr */
    int line;         /* source line for addr; 0 if unknown */
};

/* the debug lookup cache is useful if BBCC for same BB are
 * dumped directly in a row. This is a direct mapped cache.
 *
 * The slot is (address % DEBUG_CACHE_SIZE); the four arrays are
 * parallel, i.e. entry i of each array describes the same address.
 */
#define DEBUG_CACHE_SIZE 1777

static Addr       debug_cache_addr[DEBUG_CACHE_SIZE];  /* address cached in the slot */
static file_node* debug_cache_file[DEBUG_CACHE_SIZE];  /* file found for it */
static int        debug_cache_line[DEBUG_CACHE_SIZE];  /* line found for it */
static Bool       debug_cache_info[DEBUG_CACHE_SIZE];  /* result of the debug info lookup */

/* Clear all slots of the debug lookup cache. */
static __inline__
void init_debug_cache()
{
    int slot = DEBUG_CACHE_SIZE;

    while (slot-- > 0) {
	debug_cache_info[slot] = 0;
	debug_cache_line[slot] = 0;
	debug_cache_file[slot] = 0;
	debug_cache_addr[slot] = 0;
    }
}

/* Fill <p> with the source position of <addr>, which lies in the basic
 * block of <bbcc>.  File and line come from the debug lookup cache if the
 * slot for <addr> matches, otherwise from the debug info (file "???" and
 * line 0 if there is none), and the slot is refreshed.
 * Returns whether debug info was available for <addr>. */
static __inline__
Bool get_debug_pos(BBCC* bbcc, Addr addr, APos* p)
{
    Char fname[FILENAME_LEN];
    Bool found;
    int slot = addr % DEBUG_CACHE_SIZE;

    if (debug_cache_addr[slot] != addr) {
	/* miss: ask the debug info and remember the answer */
	found = VG_(get_filename_linenum)(addr, fname,
					  FILENAME_LEN, &(p->line));
	if (!found) {
	    VG_(strcpy)(fname, "???");
	    p->line = 0;
	}
	p->file = get_file_node(bbcc->bb->obj, fname);

	debug_cache_addr[slot] = addr;
	debug_cache_file[slot] = p->file;
	debug_cache_line[slot] = p->line;
	debug_cache_info[slot] = found;
    }
    else {
	/* hit: reuse the stored result */
	found   = debug_cache_info[slot];
	p->file = debug_cache_file[slot];
	p->line = debug_cache_line[slot];
    }

    /* both addresses are stored with the object's offset subtracted */
    p->bb_addr = bbcc->bb->addr - bbcc->bb->obj->offset;
    p->addr    = addr - bbcc->bb->obj->offset;

    CT_DEBUG(3, "  get_debug_pos(0x%x): BB 0x%x, fn '%s', file '%s', line %d\n",
		addr, bbcc->bb->addr, bbcc->cxt->fn[0]->name,
		p->file->name, p->line);

    return found;
}


/* Set up <p> from the given addresses and file; the line starts at 0 */
static void init_apos(APos* p, Addr addr, Addr bb_addr, file_node* file)
{
    p->line    = 0;
    p->file    = file;
    p->bb_addr = bb_addr;
    p->addr    = addr;
}

/* Copy all members of position <src> into <dst> */
static void copy_apos(APos* dst, APos* src)
{
    *dst = *src;
}

/* a FCC entity that can be written out in one line
 * with position inside a function
 */
typedef struct _FCost FCost;
struct _FCost {
    APos p;   /* position the cost belongs to */
    fCC fcc;  /* cost collected for that position */
};

/* Set the position of <c> from the arguments (line 0) and
 * initialize its cost via init_fcc() */
static void init_fcost(FCost* c, Addr addr, Addr bb_addr, file_node* file)
{
    APos* pos = &(c->p);

    pos->addr    = addr;
    pos->bb_addr = bb_addr;
    pos->file    = file;
    pos->line    = 0;

    init_fcc( &(c->fcc) );
}


/**
 * Write the file (and, with clo_dump_bbs, line) change between the
 * positions <last> and <curr> to <fd>.
 * A file change is written as "fe=" when switching back to <func_file>,
 * the file of the function itself, and as "fi=" otherwise.
 * <last> is left untouched; updating it is up to the caller.
 */
static void fprint_apos(Int fd, APos* curr, APos* last, file_node* func_file)
{
    CT_ASSERT(curr->file != 0);
    CT_DEBUG(2, "    print_apos(file '%s', line %d, bb %p, addr %p) fnFile '%s'\n",
	     curr->file->name, curr->line, curr->bb_addr, curr->addr,
	     func_file->name);

    if (last->file != curr->file) {
	/* "fe=" marks the return to the function's own file */
	VG_(sprintf)(outbuf, (curr->file == func_file) ? "fe=" : "fi=");
	print_file(outbuf+3, curr->file);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
    }

    if (clo_dump_bbs && (last->line != curr->line)) {
	VG_(sprintf)(outbuf, "ln=%d\n", curr->line);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
    }
}

/**
 * Print a position.
 * This prints out differences if allowed
 *
 * With clo_dump_bbs, the offset of the address from the BB start is
 * written.  Otherwise one column is written for each of instruction
 * address, BB address and line that is enabled (clo_dump_instr,
 * clo_dump_bb, clo_dump_line).  With clo_compress_pos a column is
 * written relative to <last> ("+N", "-N", or "*" for no change) if the
 * last value is > 0 and the difference is within (-100, 100).
 *
 * This doesn't set last to curr afterwards!
 */
static
void fprint_pos(Int fd, APos* curr, APos* last)
{
    /* Start from an empty string: if no column is enabled below, nothing
     * is formatted, and stale contents of the shared outbuf must not be
     * written out. */
    outbuf[0] = 0;

    if (clo_dump_bbs)
	VG_(sprintf)(outbuf, "%u ", curr->addr - curr->bb_addr);
    else {
	/* number of characters already formatted into outbuf */
	int p = 0;
	if (clo_dump_instr) {
	    int diff = curr->addr - last->addr;
	    if ( clo_compress_pos && (last->addr >0) && 
		 (diff > -100) && (diff < 100)) {
		if (diff >0)
		    p = VG_(sprintf)(outbuf, "+%d ", diff);
		else if (diff==0)
		    p = VG_(sprintf)(outbuf, "* ");
	        else
		    p = VG_(sprintf)(outbuf, "%d ", diff);
	    }
	    else
		p = VG_(sprintf)(outbuf, "%p ", curr->addr);
	}

	if (clo_dump_bb) {
	    int diff = curr->bb_addr - last->bb_addr;
	    if ( clo_compress_pos && (last->bb_addr >0) && 
		 (diff > -100) && (diff < 100)) {
		if (diff >0)
		    p += VG_(sprintf)(outbuf+p, "+%d ", diff);
		else if (diff==0)
		    p += VG_(sprintf)(outbuf+p, "* ");
	        else
		    p += VG_(sprintf)(outbuf+p, "%d ", diff);
	    }
	    else
		p += VG_(sprintf)(outbuf+p, "%p ", curr->bb_addr);
	}

	if (clo_dump_line) {
	    int diff = curr->line - last->line;
	    if ( clo_compress_pos && (last->line >0) && 
		 (diff > -100) && (diff < 100)) {

		if (diff >0)
		    VG_(sprintf)(outbuf+p, "+%d ", diff);
		else if (diff==0)
		    VG_(sprintf)(outbuf+p, "* ");
	        else
		    VG_(sprintf)(outbuf+p, "%d ", diff);
	    }
	    else
		VG_(sprintf)(outbuf+p, "%u ", curr->line);
	}
    }
    fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
}


/**
 * Print events.
 * This does compression if allowed
 *
 * Up to COMPRESS_FCC_SIZE recently printed costs are kept in comp_fcc.
 * comp_next links the slots into a list by index, starting at comp_start
 * (-1: none); comp_used is the number of slots filled so far.
 */
#define COMPRESS_FCC_SIZE 5

static fCC comp_fcc[COMPRESS_FCC_SIZE];
static Int comp_next[COMPRESS_FCC_SIZE], comp_start, comp_used;

/* Reset the event compression state: all slots initialized and
 * unlinked, list empty.  Does nothing if event compression is off. */
static __inline__
void init_comp_fcc()
{
    int slot;

    if (clo_compress_events) {
	comp_used  = 0;
	comp_start = -1;

	for (slot = COMPRESS_FCC_SIZE - 1; slot >= 0; slot--) {
	    comp_next[slot] = -1;
	    init_fcc( &(comp_fcc[slot]) );
	}
    }
}

/* Write the cost <fcc>, terminated by a newline, to <fd>.
 * Without clo_compress_events the cost is always written in full.
 * Otherwise it is looked up in the list of recently written costs: if
 * found at list position j, only "*" (j == 0) or "-<j>" is written and
 * the entry is moved to the front of the list.  If not found, the cost
 * is written in full and stored at the front of the list, using a fresh
 * slot while there is one, else the slot visited last by the search. */
static
void fprint_fcc(int fd, fCC* fcc)
{
    int i, j, last;

    if (!clo_compress_events) {
	int p = sprint_fcc(outbuf, fcc);
	VG_(sprintf)(outbuf+p, "\n");
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
	return;
    }

    CT_DEBUGIF(1) {
	CT_DEBUG(1, "fprint_fcc: ");
	print_fcc(12, fcc);	
    }

    /* walk the list: i is the current slot, last the slot before it,
     * j the list position; at most COMPRESS_FCC_SIZE entries are visited */
    last = -1;
    for(j=0,i=comp_start;
	(i>=0) && (j<COMPRESS_FCC_SIZE);
	j++,last=i,i=comp_next[last]) {

	if (is_equal_fcc(fcc, comp_fcc+i)) {
	    if (j==0)
		VG_(sprintf)(outbuf, "*\n");
	    else
		VG_(sprintf)(outbuf, "-%d\n",j);
	    fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
	    
	    if (j>0) {
		/* set index <i> on top of list */
		comp_next[last] = comp_next[i];
		comp_next[i]    = comp_start;
		comp_start      = i;
	    }

	    CT_DEBUGIF(1) {
		CT_DEBUG(1, "  FOUND at %d (%d.), used %d, start %d\n",
			 i, j+1, comp_used, comp_start);
		/* i is reused as plain loop index here; we return right after */
		for(i=0;i<COMPRESS_FCC_SIZE;i++) {
		    CT_DEBUG(1, "   Idx %d (next %d): ", i, comp_next[i]);
		    print_fcc(12, comp_fcc+i);
		}
	    }
	    return;
	}
    }

    /* not found: write the cost in full */
    i = sprint_fcc(outbuf, fcc);
    VG_(sprintf)(outbuf+i, "\n");
    fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
	
    /* take a fresh slot while there is one; otherwise <last> still is the
     * slot visited last by the search above and gets overwritten */
    if (comp_used < COMPRESS_FCC_SIZE) {
	last = comp_used;
	comp_used++;
    }

    /* store the cost and make its slot the new list start */
    copy_fcc(comp_fcc+last, fcc);
    comp_next[last] = comp_start;
    comp_start      = last;

    CT_DEBUGIF(1) {
	CT_DEBUG(1, "  PUT at %d, used %d, start %d\n",
		 last, comp_used, comp_start);
	for(i=0;i<COMPRESS_FCC_SIZE;i++) {
	    CT_DEBUG(1, "   Idx %d (next %d): ", i, comp_next[i]);
	    print_fcc(12, comp_fcc+i);
	}
    }
}


/* Write one cost line for <c> to <fd>: first the position, relative to
 * the last written position <last> where allowed, then the cost itself.
 * <last> is updated to the position of <c>, and the cost is added to the
 * total of this dump (dump_total_fcc).
 */
static void fprint_fcost(Int fd, FCost* c, APos* last)
{
    APos* pos  = &(c->p);
    fCC*  cost = &(c->fcc);

    CT_DEBUGIF(3) {
	CT_DEBUG(2, "   print_fcost(file '%s', line %d, bb %p, addr %p):\n",
		     c->p.file->name, c->p.line, c->p.bb_addr, c->p.addr);
	print_fcc(-5, cost);
    }

    /* position first; afterwards it becomes the new "last" position */
    fprint_pos(fd, pos, last);
    copy_apos(last, pos);

    fprint_fcc(fd, cost);

    /* keep the dump total up to date */
    add_fcc( &dump_total_fcc, cost );
}


/* Write out the calls from jcc (at pos)
 *
 * <curr> is the position of the jump/call instruction, <last> the
 * position written last.
 * - A JCC for a followed conditional or boring jump is written as
 *   "jcnd=<followed>/<executions>" resp. "jump=<count>", preceded by
 *   "jfi="/"jfn=" lines if target file or context differ, and followed
 *   by the target and the source position.  Such a jump is skipped if
 *   its target position carries no usable information.
 * - Otherwise the JCC is a call: "cob="/"cfi=" lines are written if
 *   object or file of the callee differ, then "cfn=", and, if the call
 *   has cost, "calls=<count> <target>" plus a line with the source
 *   position and the inclusive cost; the cost sum is reset afterwards.
 * The call counter of the JCC is reset whenever it was written out.
 */
static void fprint_jcc(Int fd, jCC* jcc, APos* curr, APos* last)
{
    static APos target;
    file_node* file;
    obj_node*  obj;

    /* The target is dereferenced right below, so check it up front
     * (this assertion used to come only after the first uses). */
    CT_ASSERT(jcc->to !=0);

    CT_DEBUG(2, "  fprint_jcc: from 0x%x (jkind %d) to fn '%s'/rec %d\n",
	     jcc->from->bb->jmp_addr, jcc->jmpkind,
	     jcc->to->cxt->fn[0]->name, jcc->to->rec_index);

    if (!get_debug_pos(jcc->to, jcc->to->bb->addr, &target)) {
	/* if we don't have debug info, don't switch to file "???" */
	target.file = last->file;
    }

    if (jcc->from &&
	(jcc->jmpkind == JmpCond || jcc->jmpkind == JmpBoring)) {
	    
	/* this is a JCC for a followed conditional or boring jump. */
	CT_ASSERT(is_zero_fcc(&(jcc->sum)));
	
	/* objects among jumps should be the same.
	 * Otherwise this jump would have been changed to a call
	 *  (see setup_bbcc)
	 */
	CT_ASSERT(jcc->from->bb->obj == jcc->to->bb->obj);

	/* only print if target position info is useful */
	if (!clo_dump_instr && !clo_dump_bb && target.line==0) {
	    jcc->call_counter = 0;
	    return;
	}

	/* Different files/functions are possible e.g. with longjmp's
	 * which change the stack, and thus context
	 */
	if (last->file != target.file) {
	    VG_(sprintf)(outbuf, "jfi=");
	    print_file(outbuf+4, target.file);
	    fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
	}
	
	if (jcc->from->cxt != jcc->to->cxt) {
	    if (clo_mangle_names)
		print_mangled_fn(fd, outbuf, "jfn",
				 jcc->to->cxt, jcc->to->rec_index);
	    else
		print_fn(fd, outbuf, "jfn", jcc->to->cxt->fn[0]);
	}
	    
	if (jcc->jmpkind == JmpCond) {
	    /* format: jcnd=<followed>/<executions> <target> */
	    VG_(sprintf)(outbuf, "jcnd=%llu/%llu ",
			 jcc->call_counter, jcc->from->exe_counter);
	}
	else {
	    /* format: jump=<jump count> <target> */
	    VG_(sprintf)(outbuf, "jump=%llu ",
			 jcc->call_counter);
	}
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
		
	/* target position, then source position on a line of its own */
	fprint_pos(fd, &target, last);
	fwrite(fd, "\n", 1);
	fprint_pos(fd, curr, last);
	fwrite(fd, "\n", 1);

	jcc->call_counter = 0;
	return;
    }

    /* from here on: the JCC is a call */
    file = jcc->to->cxt->fn[0]->file;
    obj  = jcc->to->bb->obj;
    
    /* object of called position different to object of this function?*/
    if (jcc->from->cxt->fn[0]->file->obj != obj) {
	VG_(sprintf)(outbuf, "cob=");
	print_obj(outbuf+4, obj);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
    }

    /* file of called position different to current file? */
    if (last->file != file) {
	VG_(sprintf)(outbuf, "cfi=");
	print_file(outbuf+4, file);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
    }

    if (clo_mangle_names)
	print_mangled_fn(fd, outbuf, "cfn", jcc->to->cxt, jcc->to->rec_index);
    else
	print_fn(fd, outbuf, "cfn", jcc->to->cxt->fn[0]);

    if (!is_zero_fcc(&(jcc->sum))) {
	VG_(sprintf)(outbuf, "calls=%llu ",  jcc->call_counter);
	fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));

	fprint_pos(fd, &target, last);
	fwrite(fd, "\n", 1);	

	/* source position, followed by the inclusive cost of the call */
	fprint_pos(fd, curr, last);
	fprint_fcc(fd, &(jcc->sum));

	init_fcc( &(jcc->sum) );

	jcc->call_counter = 0;
    }
}


/* Cost summation of functions. We use ccSum[0] and ccSum[1] alternately:
 * ccSum[currSum] collects the cost of recently read instructions with the
 * same file/line, while ccSum[1-currSum] receives the position of the
 * instruction looked at next (see fprint_bbcc).
 */
static FCost ccSum[2];
static int currSum;

/*
 * Print all costs of a BBCC:
 * - FCCs of instructions
 * - JCCs of the unique jump of this BB
 * returns True if something was written 
 *
 * The cost centers of the BBCC are stored back to back in bbcc->array
 * and have different sizes; the tag at the start of each one selects its
 * layout.  Costs of consecutive instructions with the same file/line are
 * summed up in ccSum[currSum] and written once the position changes (or
 * for every instruction with clo_dump_bbs/clo_dump_instr).
 * The execution and return counters of the BBCC are reset at the end.
 */
static Bool fprint_bbcc(Int fd, BBCC* bbcc, APos* last)
{
   Addr BBCC_ptr0, BBCC_ptr;

   Bool something_written = False;
   jCC* jcc;
   FCost *currCost, *newCost;
   UInt array_size = bbcc->bb->array_size;
   UInt jcc_count = 0, instr_count = 0;

   CT_ASSERT(bbcc->cxt != 0);
   CT_DEBUG(1, "+ fprint_bbcc: BB 0x%x, Cxt %d "
	       "(fn '%s', rec %d)\n", 
	       bbcc->bb->addr,
	       bbcc->cxt->base_number + bbcc->rec_index,
	       bbcc->cxt->fn[0]->name,
	       bbcc->rec_index);

   CT_ASSERT(currSum == 0 || currSum == 1);
   currCost = &(ccSum[currSum]);
   newCost  = &(ccSum[1-currSum]);

   /* walk the cost center array by byte address */
   BBCC_ptr0 = BBCC_ptr = (Addr)(bbcc->array);
   
   while (BBCC_ptr - BBCC_ptr0 < array_size) {

       /* get debug info of current instruction address and dump cost
	* if clo_dump_bbs or file/line has changed
	*/
       Addr instr_addr;

       instr_count++;
       
       /* We pretend the CC is an iCC for getting the tag.  This is ok
	* because both CC types have tag as their first byte.  Once we know
	* the type, we can cast and act appropriately. */
       switch ( ((iCC*)BBCC_ptr)->tag ) {
       case InstrCC:
	   instr_addr = ((iCC*)BBCC_ptr)->instr_addr;
	   break;
       case ReadCC:
       case ModCC:
       case WriteCC:
	   instr_addr = ((idCC*)BBCC_ptr)->instr_addr;
	   break;
       case ReadWriteCC:
	   instr_addr = ((iddCC*)BBCC_ptr)->instr_addr;
	   break;
       default:
	   VG_(skin_panic)("Unknown CC type in fprint_bbcc()\n");
	   break;
       }
       
       /* the position of this instruction goes into the spare buffer */
       if (!get_debug_pos(bbcc, instr_addr, &(newCost->p))) {
	   /* if we don't have debug info, don't switch to file "???" */
	   newCost->p.file = bbcc->cxt->fn[0]->file;
       }

       if (clo_dump_bbs || clo_dump_instr ||
	   (newCost->p.line != currCost->p.line) ||
	   (newCost->p.file != currCost->p.file)) {

	   /* position changed: write what was summed up so far */
	   if (!is_zero_fcc( &(currCost->fcc) )) {
	       something_written = True;
	       
	       fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file);
	       fprint_fcost(fd, currCost, last);
	   }
	   
	   /* switch buffers */
	   currSum = 1 - currSum;
	   currCost = &(ccSum[currSum]);
	   newCost  = &(ccSum[1-currSum]);
       }
       
       /* add line cost to current cost sum */

       /* FIXME: Is this too simple ? */
       if (!clo_simulate_cache) {
	   /* without cache simulation only the execution count of the
	    * BBCC is added; then skip to the next cost center */
	   currCost->fcc.Ir.a += bbcc->exe_counter;
	   switch ( ((iCC*)BBCC_ptr)->tag ) {
	       case InstrCC: BBCC_ptr += sizeof(iCC); break;
	       case ReadCC:
	       case ModCC:
	       case WriteCC: BBCC_ptr += sizeof(idCC); break;
	       case ReadWriteCC: BBCC_ptr += sizeof(iddCC); break;
	       default:
		   VG_(skin_panic)("Unknown CC type in fprint_bbcc()\n");
		   break;
	   }
	   continue;
       }
       
       /* with cache simulation: move the simulated costs into the sum
	* and advance by the size of this cost center type */
       switch ( ((iCC*)BBCC_ptr)->tag ) {
	   
       case InstrCC:
	   add_and_zero_cc( &(currCost->fcc.Ir), &( ((iCC*)BBCC_ptr)->I) );
	   BBCC_ptr += sizeof(iCC);
	   break;
	   
       case ReadCC:
       case  ModCC:
	   {
	       idCC* cc = (idCC*)BBCC_ptr;
	       
	       add_and_zero_cc( &(currCost->fcc.Ir), &cc->I);
	       add_and_zero_cc( &(currCost->fcc.Dr), &cc->D);
	   }
	   BBCC_ptr += sizeof(idCC);
	   break;
	   
       case WriteCC:
	   {
	       idCC* cc = (idCC*)BBCC_ptr;
	       
	       add_and_zero_cc( &(currCost->fcc.Ir), &cc->I);
	       add_and_zero_cc( &(currCost->fcc.Dw), &cc->D);
	   }
	   BBCC_ptr += sizeof(idCC);
	   break;
	   
       case ReadWriteCC:
	   {
	       iddCC* cc = (iddCC*)BBCC_ptr;
	       
	       add_and_zero_cc( &(currCost->fcc.Ir), &cc->I);
	       /* Da is read, Db is write access */
	       add_and_zero_cc( &(currCost->fcc.Dr), &cc->Da);
	       add_and_zero_cc( &(currCost->fcc.Dw), &cc->Db);
	   }
	   BBCC_ptr += sizeof(iddCC);
	   break;
	   
       default:
	   VG_(skin_panic)("Unknown CC type in fprint_bbcc()\n");
	   break;
       }
   }
   
   /* Some JCC output? If yes, dump cumulated line info first */
   for(jcc=bbcc->jcc_list; jcc; jcc=jcc->next_from) {
       /* yes, if JCC only counts jmp arcs or cost >0 */
       if ( ((jcc->jmpkind != JmpCall) && (jcc->call_counter>0)) ||
	    (!is_zero_fcc(&(jcc->sum))))
	   jcc_count++;
   }

   if ( (bbcc->skipped && !is_zero_fcc(&(bbcc->skipped->fcc))) || 
	(jcc_count>0) ) {

       if (!is_zero_fcc( &(currCost->fcc) )) {
	   /* no need to switch buffers, as position is the same */
	   fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file);
	   fprint_fcost(fd, currCost, last);
       }

       /* the following output is attributed to the jump instruction */
       get_debug_pos(bbcc, bbcc->bb->jmp_addr, &(currCost->p));
       fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file);
       something_written = True;

       /* first, print skipped costs for calls */
       if (bbcc->skipped && !is_zero_fcc(&(bbcc->skipped->fcc))) {
	   add_fcc( &(currCost->fcc), &(bbcc->skipped->fcc) );
	   if (clo_dump_skipped)
	       print_fn(fd, outbuf, "csk", bbcc->skipped->fn);
	   fprint_fcost(fd, currCost, last);
       }

       /* then the JCCs selected by the same condition as counted above */
       if (jcc_count > 0)
	   for(jcc=bbcc->jcc_list; jcc; jcc=jcc->next_from)
	       if ( ((jcc->jmpkind != JmpCall) && (jcc->call_counter>0)) ||
		    (!is_zero_fcc(&(jcc->sum))))
		   
		   fprint_jcc(fd, jcc, &(currCost->p), last);
   }
   
   if (clo_dump_bbs || clo_dump_bb) {
       /* in these modes the remaining sum is written per BB */
       if (!is_zero_fcc( &(currCost->fcc) )) {
	   something_written = True;

	   fprint_apos(fd, &(currCost->p), last, bbcc->cxt->fn[0]->file);
	   fprint_fcost(fd, currCost, last);
       }
       if (clo_dump_bbs) fwrite(fd, (void*)"\n", 1);

       /* when every cost was immediately written, we must have done so,
	* as this function is only called when there's cost in a BBCC
	*/
       CT_ASSERT(something_written);
   }

   bbcc->exe_counter = 0;
   bbcc->ret_counter = 0;

   /* the walk must have ended exactly at the end of the array */
   CT_ASSERT(BBCC_ptr - BBCC_ptr0 == array_size);

   CT_DEBUG(1, "- fprint_bbcc: Instr %d, JCCs %d\n",
	       instr_count, jcc_count);
   
   
   return something_written;
}



/* Comparison function for sorting BBCCs before dumping.
 * Order by
 *  object, file and function of fn[0] of the context,
 *  recursion index,
 *  the further functions of the context (object, then function),
 *  context size,
 *  address of the basic block
 *
 * Returns <0, 0 or >0.  Pointers and addresses are compared with
 * relational operators instead of being subtracted: a difference
 * truncated to int can get the wrong sign (or become 0) for values far
 * apart, which makes the ordering inconsistent.
 */
static int my_cmp(BBCC** pbbcc1, BBCC** pbbcc2)
{
    BBCC *bbcc1 = *pbbcc1;
    BBCC *bbcc2 = *pbbcc2;
    Context* cxt1 = bbcc1->cxt;
    Context* cxt2 = bbcc2->cxt;
    int off = 1;

    if (cxt1->fn[0]->file->obj != cxt2->fn[0]->file->obj)
	return (cxt1->fn[0]->file->obj < cxt2->fn[0]->file->obj) ? -1 : 1;

    if (cxt1->fn[0]->file != cxt2->fn[0]->file)
	return (cxt1->fn[0]->file < cxt2->fn[0]->file) ? -1 : 1;

    if (cxt1->fn[0] != cxt2->fn[0])
	return (cxt1->fn[0] < cxt2->fn[0]) ? -1 : 1;

    if (bbcc1->rec_index != bbcc2->rec_index)
	return bbcc1->rec_index - bbcc2->rec_index;

    /* compare the remaining context entries pairwise */
    while((off < cxt1->size) && (off < cxt2->size)) {
	fn_node* ffn1 = cxt1->fn[off];
	fn_node* ffn2 = cxt2->fn[off];
	if (ffn1->file->obj != ffn2->file->obj)
	    return (ffn1->file->obj < ffn2->file->obj) ? -1 : 1;
	if (ffn1 != ffn2)
	    return (ffn1 < ffn2) ? -1 : 1;
	off++;
    }
    /* common part is equal: the longer context sorts last */
    if      (cxt1->size > cxt2->size) return 1;
    else if (cxt1->size < cxt2->size) return -1;

    if (bbcc1->bb->addr != bbcc2->bb->addr)
	return (bbcc1->bb->addr < bbcc2->bb->addr) ? -1 : 1;

    return 0;
}





/* modified version of:
 *
 * qsort -- qsort interface implemented by faster quicksort.
 * J. L. Bentley and M. D. McIlroy, SPE 23 (1993) 1249-1265.
 * Copyright 1993, John Wiley.
*/

/* Exchange the <n> array entries starting at <a> with the <n> entries
 * starting at <b>; nothing happens for n <= 0. */
static __inline__
void swapfunc(BBCC** a, BBCC** b, int n)
{
    int k;

    for (k = 0; k < n; k++) {
	BBCC* tmp = a[k];
	a[k] = b[k];
	b[k] = tmp;
    }
}

/* Exchange the two array entries <a> and <b> point to */
static __inline__
void swap(BBCC** a, BBCC** b)
{
    BBCC* tmp = *a;

    *a = *b;
    *b = tmp;
}

/* Smaller of two values.  This is a function-like macro: one of the
 * arguments is evaluated twice, so avoid arguments with side effects. */
#define min(x, y) ((x)<=(y) ? (x) : (y))

/* Return the one of the three array positions <a>, <b>, <c> whose entry
 * is the median according to <cmp>.
 * The comparison function is declared with its full prototype (it used
 * to be an unprototyped "int (*)()"), so that calls are type checked. */
static BBCC** med3(BBCC **a, BBCC **b, BBCC **c,
		   int (*cmp)(BBCC**,BBCC**))
{	return cmp(a, b) < 0 ?
		  (cmp(b, c) < 0 ? b : cmp(a, c) < 0 ? c : a)
		: (cmp(b, c) > 0 ? b : cmp(a, c) > 0 ? c : a);
}

/* Start of the array being sorted; only used to print array indices
 * in the debug output of qsort(). */
static BBCC** qsort_start = 0;

/* Sort the <n> BBCC pointers starting at <a> in ascending order
 * according to <cmp> (three-way quicksort after Bentley/McIlroy).
 *
 * Arrays with less than 7 entries are sorted by insertion.  Otherwise a
 * pivot is chosen (middle element, median of 3, or pseudomedian of 9 for
 * big arrays) and the array is partitioned; entries equal to the pivot
 * are parked at both ends during partitioning and swapped into the
 * middle afterwards, and the "less" and "greater" parts are sorted
 * recursively.
 *
 * When moving the equal entries into the middle, only
 * min(#equal, #less) resp. min(#equal, #greater) entries are exchanged.
 * Exchanging all equal entries regardless of the size of the neighbour
 * part, as done before, misplaces entries as soon as there are more
 * equal entries than smaller (or greater) ones.
 */
static void qsort(BBCC **a, int n, int (*cmp)(BBCC**,BBCC**))
{
	BBCC **pa, **pb, **pc, **pd, **pl, **pm, **pn, **pv;
	int s, r;
	BBCC* v;

	CT_DEBUG(4, "  qsort(%d,%d)\n", a-qsort_start, n);

	if (n < 7) {	 /* Insertion sort on smallest arrays */
		for (pm = a+1; pm < a+n; pm++)
			for (pl = pm; pl > a && cmp(pl-1, pl) > 0; pl --)
				swap(pl, pl-1);

		CT_DEBUGIF(4) {
		    for (pm = a; pm < a+n; pm++) {
			VG_(printf)("   %3d BB 0x%x, ", pm - qsort_start,
				    (*pm)->bb->addr);      
			print_cxt(9, (*pm)->cxt, (*pm)->rec_index);
		    }
		}
		return;
	}
	pm = a + n/2;    /* Small arrays, middle element */
	if (n > 7) {
		pl = a;
		pn = a + (n-1);
		if (n > 40) {    /* Big arrays, pseudomedian of 9 */
			s = n/8;
			pl = med3(pl, pl+s, pl+2*s, cmp);
			pm = med3(pm-s, pm, pm+s, cmp);
			pn = med3(pn-2*s, pn-s, pn, cmp);
		}
		pm = med3(pl, pm, pn, cmp); /* Mid-size, med of 3 */
	}


	/* compare against a copy of the pivot, as the array gets permuted */
	v = *pm;
	pv = &v;
	pa = pb = a;
	pc = pd = a + (n-1);
	/* Partition.  Invariant: [a,pa) equal, [pa,pb) less,
	 * (pc,pd] greater, (pd,a+n) equal to the pivot. */
	for (;;) {
		while ((pb <= pc) && ((r=cmp(pb, pv)) <= 0)) {
		    if (r==0) {
			/* same as pivot, to start */
			swap(pa,pb); pa++; 
		    }
		    pb ++;
		}
		while ((pb <= pc) && ((r=cmp(pc, pv)) >= 0)) {
		    if (r==0) {
			/* same as pivot, to end */
			swap(pc,pd); pd--; 
		    }
		    pc --;
		}
		if (pb > pc) { break; }
		swap(pb, pc);
		pb ++;
		pc --;
	}
	/* now pb is the last "less" entry, pc the first "greater" one */
	pb--;
	pc++;

	/* put pivot from start into middle:
	 * exchange min(#equal, #less) entries */
	s = pa-a;
	if (s > pb+1-pa) s = pb+1-pa;
	for(r=0;r<s;r++) swap(a+r, pb+1-s+r);

	/* put pivot from end into middle:
	 * exchange min(#equal, #greater) entries */
	s = a+n-1-pd;
	if (s > pd+1-pc) s = pd+1-pc;
	for(r=0;r<s;r++) swap(pc+r, a+n-s+r);

	CT_DEBUGIF(4) {
	    VG_(printf)("   PV BB 0x%x, ", (*pv)->bb->addr);      
	    print_cxt(9, (*pv)->cxt, (*pv)->rec_index);

	    s = pb-pa+1;
	    VG_(printf)("    Lower %d - %d:\n", a-qsort_start, a+s-1-qsort_start);
	    for (r=0;r<s;r++) {
		pm = a+r;
		VG_(printf)("     %3d BB 0x%x, ", pm-qsort_start,(*pm)->bb->addr);
		print_cxt(9, (*pm)->cxt, (*pm)->rec_index);
	    }

	    s = pd-pc+1;
	    VG_(printf)("    Upper %d - %d:\n", 
			a+n-s-qsort_start, a+n-1-qsort_start);
	    for (r=0;r<s;r++) {
		pm = a+n-s+r;
		VG_(printf)("     %3d BB 0x%x, ", pm-qsort_start,(*pm)->bb->addr);
		print_cxt(9, (*pm)->cxt, (*pm)->rec_index);
	    }
	}

	/* the "less" part now is at the start, the "greater" part at the
	 * end of the array; their sizes did not change */
	if ((s = pb+1-pa) > 1) qsort(a,     s, cmp);
	if ((s = pd+1-pc) > 1) qsort(a+n-s, s, cmp);
}



/*------------------------------------------------------------*/
/*--- SK_(fini)() and related function                     ---*/
/*------------------------------------------------------------*/



/**
 * Put all BBCCs with costs into a sorted array.
 * The returned array ends with a null pointer.
 * Must be freed after dumping.
 *
 * Works in two passes that have to select exactly the same entries:
 * the first pass only counts them (to size the allocation), the second
 * pass stores them. An assertion checks that both passes agree.
 *
 * An entry is selected if
 *  - it is found via bbcc_table / rec_array and its exe_counter or
 *    ret_counter is > 0, or
 *  - it is the `from` BBCC of the jcc of a call stack entry and both
 *    of its counters are zero.
 * With clo_dump_threads set only the call stack of the current thread
 * is scanned; otherwise the call stacks of all existing threads are.
 *
 * Side effects: the first pass calls add_diff_fcc() on the cost sum of
 * the jcc of every scanned call stack entry; qsort_start is set to the
 * returned array before sorting with my_cmp.
 */
static BBCC** prepare_dump()
{
    BBCC *bbcc, *bbcc2, **array, **p;
    int bbcc_count = 0;
    int i,j,t;

    /* ---- Pass 1: count the entries ---- */

    /* count number of BBCCs with >0 executions */
    for (i = 0; i < bbcc_table_size; i++) {
	if ((bbcc=bbcc_table[i]) == NULL) continue;
	/* walk the chain of this hash bucket */
	while (bbcc) {
	    /* every bbcc should have a rec_array */
	    CT_ASSERT(bbcc->rec_array != 0);

	    /* one slot per recursion level of the function */
	    for(j=0;j<bbcc->cxt->fn[0]->fn_recursion;j++) {
		if ((bbcc2 = bbcc->rec_array[j]) == 0) continue;
		if (bbcc2->exe_counter>0 || bbcc2->ret_counter>0)
			bbcc_count++;
	    }
	    bbcc = bbcc->next1;
	}
    }
    
    if (clo_dump_threads) {
	/* add BBCCs with active call in call stack of current thread.
	 * update cost sums for active calls
	 */
	for(i = 0; i < call_stack_sp; i++) {
	    if (call_stack[i].jcc == 0) continue;

	    add_diff_fcc( &(call_stack[i].jcc->sum),
			  &(call_stack[i].fcc), current_fcc);
	    bbcc = call_stack[i].jcc->from;

	    CT_DEBUG(1, " [%2d] (tid %d), added active: %s\n",
			i,current_tid,bbcc->cxt->fn[0]->name);

	    if (bbcc->exe_counter>0 || bbcc->ret_counter>0) {
		/* already counted */
		continue;
	    }
	    bbcc_count++;
	}
    }
    else {
	/* go through call stacks of all threads */
	/* NOTE(review): this loop starts at t=0 while the other thread
	 * loops visible in this file start at t=1 - confirm intended. */
	for(t=0;t<VG_N_THREADS;t++) {
	    if (!thread[t]) continue;
	    if (t == current_tid) {
		/* the current thread is scanned via the global
		 * call_stack / call_stack_sp / current_fcc */
		for(i = 0; i < call_stack_sp; i++) {
		    if (call_stack[i].jcc == 0) continue;     
		    add_diff_fcc( &(call_stack[i].jcc->sum),
				  &(call_stack[i].fcc), current_fcc);

		    bbcc = call_stack[i].jcc->from;

		    CT_DEBUG(1, " [%2d] (tid %d), added active: %s\n",
				i,t,bbcc->cxt->fn[0]->name);

		    if (bbcc->exe_counter>0 || bbcc->ret_counter>0) {
			/* already counted */
			continue;
		    }
		    bbcc_count++;
		}
	    }
	    else {
		/* other threads are scanned via the state stored in
		 * their thread[t] entry */
		for(i = 0; i < thread[t]->stack_sp; i++) {
		    if (thread[t]->stack[i].jcc == 0) continue;     
		    add_diff_fcc( &(thread[t]->stack[i].jcc->sum),
				  &(thread[t]->stack[i].fcc),
				  &(thread[t]->cxt_stack[0]->current) );
		    bbcc = thread[t]->stack[i].jcc->from;

		    CT_DEBUG(1, " [%2d] (tid %d), added active: %s\n",
				i,t,bbcc->cxt->fn[0]->name);

		    if (bbcc->exe_counter>0 || bbcc->ret_counter>0) {
			/* already counted */
			continue;
		    }
		    bbcc_count++;
		}
	    }
	}
    }


    CT_DEBUG(0, "prepare_dump: %d BBCCs\n", bbcc_count);

    /* ---- Pass 2: fill the array (same selection as pass 1) ---- */

    /* allocate bbcc array, insert BBCCs and sort */
    /* one extra slot for the terminating null pointer */
    p = array = (BBCC**) VG_(malloc)((bbcc_count+1) * sizeof(BBCC*));    

    for (i = 0; i < bbcc_table_size; i++) {
	if ((bbcc=bbcc_table[i]) == NULL) continue;
	while (bbcc) {
	    for(j=0;j<bbcc->cxt->fn[0]->fn_recursion;j++) {
		if ((bbcc2 = bbcc->rec_array[j]) == 0) continue;
		if ((bbcc2->exe_counter == 0) && 
		    (bbcc2->ret_counter == 0)) continue;
		*p = bbcc2;
		p++;
	    }
	    bbcc = bbcc->next1;
	}
    }

    if (clo_dump_threads) {
	/* add BBCCs with active call in call stack of current thread.
	 */
	for(i = 0; i < call_stack_sp; i++) {
	    if (call_stack[i].jcc == 0) continue;

	    bbcc = call_stack[i].jcc->from;
	    /* non-zero counters: treated as already stored by the
	     * table scan above */
	    if ((bbcc->exe_counter>0) || (bbcc->ret_counter>0)) continue;

	    *p = bbcc;
	    p++;
	}
    }
    else {
	/* go through call stacks of all threads */
	for(t=0;t<VG_N_THREADS;t++) {
	    if (!thread[t]) continue;
	    if (t == current_tid) {
		for(i = 0; i < call_stack_sp; i++) {
		    if (call_stack[i].jcc == 0) continue;     
		    bbcc = call_stack[i].jcc->from;
		    if ((bbcc->exe_counter>0) ||
			(bbcc->ret_counter>0)) continue;
		    
		    *p = bbcc;
		    p++;
		}
	    }
	    else {
		for(i = 0; i < thread[t]->stack_sp; i++) {
		    if (thread[t]->stack[i].jcc == 0) continue;     
		    bbcc = thread[t]->stack[i].jcc->from;
		    if ((bbcc->exe_counter>0) ||
			(bbcc->ret_counter>0)) continue;

		    *p = bbcc;
		    p++;
		}
	    }
	}
    }

    /* both passes must have selected the same number of entries */
    CT_ASSERT(array + bbcc_count == p);

    /* end mark */
    *p = 0;

    CT_DEBUG(0,"             BBCCs inserted\n");

    /* qsort_start is read by the debug output of qsort() to print
     * indices relative to the start of the array */
    qsort_start = array;
    qsort(array, bbcc_count, my_cmp);

    CT_DEBUG(0,"             BBCCs sorted\n");

    return array;
}




/* Write one line "<prefix><cost values>\n" to fd.
 * The line is assembled in the shared buffer outbuf: first the prefix,
 * then the text produced by sprint_fcc() for fcc, then a newline. */
static void fprint_fcc_ln(int fd, Char* prefix, fCC* fcc)
{
    int used = VG_(sprintf)(outbuf, "%s", prefix);

    used += sprint_fcc(outbuf + used, fcc);
    VG_(sprintf)(outbuf + used, "\n");

    fwrite(fd, (void*)outbuf, VG_(strlen)(outbuf));
}

/* Value of VG_(bbs_done) at the end of the previous dump; printed as
 * the start of the "Timerange" description of the next dump. */
static ULong last_bbs_done = 0;
/* Buffer holding the name of the current dump file. new_dumpfile()
 * asserts that it is non-null and formats the name into it; where it
 * is allocated is not visible in this part of the file. */
static Char* filename = 0;

/* Report that the dump file named by the global `filename` could not
 * be opened, then terminate via VG_(exit)(1). */
static void file_err()
{
   VG_(message)(Vg_UserMsg,
                "error: can't open cache simulation output file `%s'",
                filename );
   VG_(exit)(1);
}

/**
 * Create a new dump file and write header.
 *
 * Naming with clo_separate_dumps:
 *         <dump_file_base>.<pid>[.<part>][-<tid>]
 *         <part> is skipped for final dump (trigger==0)
 *         <tid>  is skipped with clo_dump_threads=no
 * Naming without clo_separate_dumps:
 *         <dump_file_base>.<pid>
 *         The file is opened for appending; for every part after the
 *         first one the file-level header lines (version, pid, cmd,
 *         option and cache descriptions) are not written again.
 *
 * Parameters:
 *   buf      scratch buffer used to format the header lines
 *   tid      thread the dump is written for
 *   trigger  description of what caused the dump, or 0 for the final
 *            dump at program termination
 *
 * Side effects: overwrites the global `filename`, resets the dump array
 * (unless appending) and re-initializes dump_total_fcc.
 *
 * Returns the file descriptor, and -1 on error (no write permission).
 * NOTE(review): on open failure file_err() is called first, which ends
 * in VG_(exit)(1), so the -1 return is presumably never reached.
 */
static int new_dumpfile(Char buf[BUF_LEN], int tid, Char* trigger)
{
    Bool appending = False;
    int i, fd;
    fCC sum;

    CT_ASSERT(filename != 0);

    if (clo_separate_dumps) {
	/* build the file name piece by piece; i is the length so far */
	i = VG_(sprintf)(filename, "%s.%d", dump_file_base, VG_(getpid)());
    
	if (trigger)
	    i += VG_(sprintf)(filename+i, ".%d", out_counter);

	if (clo_dump_threads)
	    i += VG_(sprintf)(filename+i, "-%02d", tid);

	/* overwrite an already existing file of that name */
	fd = VG_(open)(filename, VKI_O_WRONLY|VKI_O_TRUNC, 0);
    }
    else {
	VG_(sprintf)(filename, "%s.%d", dump_file_base, VG_(getpid)());
	fd = VG_(open)(filename, VKI_O_WRONLY|VKI_O_APPEND, 0);
	/* only treat it as a continuation if this is not the first part */
	if ((fd >= 0) && out_counter>1)
	    appending = True;
    }

    /* the file does not exist yet (or could not be opened): create it */
    if (fd <0) {
	fd = VG_(open)(filename, VKI_O_CREAT|VKI_O_WRONLY,
		       VKI_S_IRUSR|VKI_S_IWUSR);
	if (fd <0) {
	    /* If the file can't be opened for whatever reason (conflict
	       between multiple supervised processes?), give up now. */
	    file_err();
	    VGP_POPCC(VgpCacheDump);
	    return -1;
	}
    }

    if (!appending)
	reset_dump_array();


    /* file-level header: written once per file */
    if (!appending) {
	/* version */
	VG_(sprintf)(buf, "version: " VERSION "\n");
	fwrite(fd, (void*)buf, VG_(strlen)(buf));

	/* "pid:" line */
	VG_(sprintf)(buf, "pid: %d\n", VG_(getpid)());
	fwrite(fd, (void*)buf, VG_(strlen)(buf));

	/* "cmd:" line */
	/* no newline here: the "\npart:" line below terminates it */
	VG_(strcpy)(buf, "cmd:");
	fwrite(fd, (void*)buf, VG_(strlen)(buf));
	for (i = 0; i < VG_(client_argc); i++) {
	    VG_(sprintf)(buf, " %s", VG_(client_argv[i]));
	    fwrite(fd, (void*)buf, VG_(strlen)(buf));
	}
    }

    /* part-level header: written for every part */
    VG_(sprintf)(buf, "\npart: %d\n", out_counter);
    fwrite(fd, (void*)buf, VG_(strlen)(buf));
    if (clo_dump_threads) {
	VG_(sprintf)(buf, "thread: %d\n", tid);
	fwrite(fd, (void*)buf, VG_(strlen)(buf));
    }

    /* "desc:" lines */
    if (!appending) {
	/* Global options changing the tracing behaviour */
	VG_(sprintf)(buf, "\ndesc: Option: --skip-plt=%s\n",
		     clo_skip_plt ? "yes" : "no");
	fwrite(fd, (void*)buf, VG_(strlen)(buf));
	VG_(sprintf)(buf, "desc: Option: --trace-jump=%s\n",
		     clo_trace_jump ? "yes" : "no");
	fwrite(fd, (void*)buf, VG_(strlen)(buf));
	VG_(sprintf)(buf, "desc: Option: --fn-recursion=%d\n",
		     clo_fn_recursion);
	fwrite(fd, (void*)buf, VG_(strlen)(buf));
	VG_(sprintf)(buf, "desc: Option: --fn-caller=%d\n",
		     clo_fn_caller);
	fwrite(fd, (void*)buf, VG_(strlen)(buf));

#if 0
	VG_(sprintf)(buf, "desc: Option: --dump-bbs=%s\n",
		     clo_dump_bbs ? "yes" : "no");
	fwrite(fd, (void*)buf, VG_(strlen)(buf));
	VG_(sprintf)(buf, "desc: Option: --dump-threads=%s\n",
		     clo_dump_threads ? "yes" : "no");
	fwrite(fd, (void*)buf, VG_(strlen)(buf));
#endif

	/* description lines of the simulated caches */
	VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
	fwrite(fd, (void*)buf, VG_(strlen)(buf));
	VG_(sprintf)(buf, "desc: D1 cache: %s\n", D1.desc_line);
	fwrite(fd, (void*)buf, VG_(strlen)(buf));
	VG_(sprintf)(buf, "desc: L2 cache: %s\n", L2.desc_line);
	fwrite(fd, (void*)buf, VG_(strlen)(buf));
    }

    /* range of executed basic blocks covered by this part */
    VG_(sprintf)(buf, "\ndesc: Timerange: Basic block %llu - %llu\n",
		 last_bbs_done, VG_(bbs_done));
    fwrite(fd, (void*)buf, VG_(strlen)(buf));
    VG_(sprintf)(buf, "desc: Trigger: %s\n",
		 trigger ? trigger : (Char*)"Program termination");
    fwrite(fd, (void*)buf, VG_(strlen)(buf));

#if 0
   /* Output function specific config
    * FIXME */
   for (i = 0; i < N_FNCONFIG_ENTRIES; i++) {
       fnc = fnc_table[i];
       while (fnc) {
	   if (fnc->skip) {
	       VG_(sprintf)(buf, "desc: Option: --fn-skip=%s\n", fnc->name);
	       fwrite(fd, (void*)buf, VG_(strlen)(buf));
	   }
	   if (fnc->dump_at_enter) {
	       VG_(sprintf)(buf, "desc: Option: --fn-dump-at-enter=%s\n",
			    fnc->name);
	       fwrite(fd, (void*)buf, VG_(strlen)(buf));
	   }   
	   if (fnc->dump_at_leave) {
	       VG_(sprintf)(buf, "desc: Option: --fn-dump-at-leave=%s\n",
			    fnc->name);
	       fwrite(fd, (void*)buf, VG_(strlen)(buf));
	   }
	   if (fnc->fn_caller != clo_fn_caller) {
	       VG_(sprintf)(buf, "desc: Option: --fn-caller%d=%s\n",
			    fnc->fn_caller, fnc->name);
	       fwrite(fd, (void*)buf, VG_(strlen)(buf));
	   }   
	   if (fnc->fn_recursion != clo_fn_recursion) {
	       VG_(sprintf)(buf, "desc: Option: --fn-recursion%d=%s\n",
			    fnc->fn_recursion, fnc->name);
	       fwrite(fd, (void*)buf, VG_(strlen)(buf));
	   }   
	   fnc = fnc->next;
       }
   }
#endif

   /* "positions:" line */
   VG_(sprintf)(buf, "\npositions:%s%s%s\n",
		clo_dump_instr ? " instr" : "",
		clo_dump_bb    ? " bb" : "",
		clo_dump_line  ? " line" : "");
   fwrite(fd, (void*)buf, VG_(strlen)(buf));

   /* "events:" line */
   if (!clo_simulate_cache)
       VG_(sprintf)(buf, "events: Ir\n");
   else
       VG_(sprintf)(buf, "events: Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw\n");
   fwrite(fd, (void*)buf, VG_(strlen)(buf));

   /* summary lines */
   /* sum is built with add_diff_fcc() from each thread's `last` cost
    * (set at the previous dump/zeroing) and its current cost */
   init_fcc( &sum );
   if (clo_dump_threads)
       add_diff_fcc(&sum, &(thread[tid]->last),
		    &(thread[tid]->cxt_stack[0]->current));
   else {
       /* This function is called once for thread 1, where
	* all costs are summed up when not dumping separate per thread.
	* But this is not true for summary: we need to add all threads.
	*/
       int t;
       for(t=1;t<VG_N_THREADS;t++) {
	   if (!thread[t]) continue;
	   add_diff_fcc(&sum,
			&(thread[t]->last),
			&(thread[t]->cxt_stack[0]->current));
       }
   }
   fprint_fcc_ln(fd, "summary: ", &sum);

   /* all dumped cost will be added to total_fcc */
   init_fcc( &dump_total_fcc );

   /* blank lines separating the header from the cost lines */
   fwrite(fd, "\n\n",2);

   if (VG_(clo_verbosity) > 1)
       VG_(message)(Vg_DebugMsg, "  Dumping to %s...", filename);

   return fd;
}


/**
 * Finish the dump file opened by new_dumpfile().
 *
 * Writes the cost of discarded basic blocks of thread `tid` (if there
 * is any) as a pseudo function "(discarded)", then the "totals:" line,
 * adds the cost of this dump to total_fcc, flushes the write buffer and
 * closes the file. If the global `filename` starts with '.', the file
 * is renamed to the name without that leading dot.
 *
 * Parameters:
 *   buf  scratch buffer used to format output lines
 *   fd   descriptor returned by new_dumpfile(); nothing is done if < 0
 *   tid  thread whose discard costs are written
 */
static void close_dumpfile(Char buf[BUF_LEN], int fd, int tid)
{
    if (fd <0) return;

    /* FIXME: with --dump-threads=no add all threads! */
    /* Print stats from any discarded basic blocks */
    if (!is_zero_fcc( &(thread[tid]->discards) )) {
	VG_(sprintf)(buf, "fl=(discarded)\n");
	fwrite(fd, (void*)buf, VG_(strlen)(buf));
	VG_(sprintf)(buf, "fn=(discarded)\n");
	fwrite(fd, (void*)buf, VG_(strlen)(buf));
	
	/* Use 0 as line number */
	fprint_fcc_ln(fd, "0 ", &(thread[tid]->discards));

	add_fcc( &dump_total_fcc, &(thread[tid]->discards));
    }

    fprint_fcc_ln(fd, "totals: ", &dump_total_fcc);
    //fprint_fcc_ln(fd, "summary: ", &dump_total_fcc);
    add_fcc(&total_fcc, &dump_total_fcc);

    fwrite_flush();    
    VG_(close)(fd);

    if (filename[0] == '.') {
	if (-1 == VG_(rename) (filename, filename+1)) {
	    /* Can't rename to correct file name: give out warning.
	     * The format string supplies the leading dot of the source
	     * name itself, so both arguments are the name without it. */
	    VG_(message)(Vg_DebugMsg, "Warning: Can't rename .%s to %s",
			 filename+1, filename+1);
       }
   }

    if (VG_(clo_verbosity) > 1)
	VG_(message)(Vg_DebugMsg, "  Finished dump.");
}


/**
 * Write the dump file(s) for the collected BBCC costs.
 *
 * For every existing thread (only the current one if
 * only_current_thread is set or clo_dump_threads is off) a dump file is
 * opened with new_dumpfile(), the sorted array from prepare_dump() is
 * written entry by entry, and the file is finished with
 * close_dumpfile(). Afterwards the thread's `last` cost is set to its
 * current cost.
 *
 * Parameters:
 *   trigger              passed on to new_dumpfile() (0 for the final
 *                        dump)
 *   only_current_thread  restrict the dump to the thread that was
 *                        current on entry
 *
 * The thread that was current on entry is switched back to at the end.
 */
static void print_bbccs(Char* trigger, Bool only_current_thread)
{
    BBCC **p, **array;
    FPos lastFPos;
    APos lastAPos;
    Char buf[BUF_LEN];
    int fd = -1;
    int orig_tid = current_tid;
    int t;

    
    /* without per-thread dumps there is only one dump, written while
     * visiting the current thread */
    if (!clo_dump_threads) only_current_thread = True;

    init_dump_array();
    init_debug_cache();

    for(t=1;t<VG_N_THREADS;t++) {
	if (!thread[t]) continue;
	if (only_current_thread && (t != orig_tid)) continue;

	fd = new_dumpfile(buf, t, trigger);
	if (fd <0) continue;

	/* prepare_dump() reads the state of the current thread */
	switch_thread(t);
	p = array = prepare_dump();

	init_fpos(&lastFPos);
	init_apos(&lastAPos, 0, 0, 0);
	init_comp_fcc();

	/* walk the null-terminated, sorted array */
	while(1) {

	    /* on context/function change, print old cost buffer before */
	    /* (*p==0) also flushes the buffer after the last entry */
	    if (lastFPos.cxt && ((*p==0) ||				 
				 (lastFPos.cxt != (*p)->cxt) ||
				 (lastFPos.rec_index != (*p)->rec_index))) {
		if (!is_zero_fcc( &(ccSum[currSum].fcc) )) {
		    /* no need to switch buffers, as position is the same */
		    fprint_apos(fd, &(ccSum[currSum].p), &lastAPos,
				lastFPos.cxt->fn[0]->file);
		    fprint_fcost(fd, &ccSum[currSum], &lastAPos);
		}
		if (ccSum[currSum].p.file != lastFPos.cxt->fn[0]->file) {
		    /* switch back to file of function */
		    VG_(sprintf)(buf, "fe=");
		    print_file(buf+3, lastFPos.cxt->fn[0]->file);
		    fwrite(fd, (void*)buf, VG_(strlen)(buf));
		}
		fwrite(fd, "\n", 1);
	    }

	    /* end mark of the array reached */
	    if (*p == 0) break;

	    if (print_fn_pos(fd, &lastFPos, *p)) {

		/* new function */
		init_apos(&lastAPos, 0, 0, (*p)->cxt->fn[0]->file);
		init_fcost(&ccSum[0], 0, 0, 0);
		init_fcost(&ccSum[1], 0, 0, 0);
		init_comp_fcc();
		currSum = 0;
	    }

	    if (clo_dump_bbs) {
		/* FIXME: Specify Object of BB if different to object of fn */
		/* "bb=" line: BB address minus the object's offset,
		 * BB size, execution count */
		VG_(sprintf)(buf, "bb=0x%x %d %llu\n", 
			     (*p)->bb->addr - (*p)->bb->obj->offset,
			     (*p)->bb->size,
			     (*p)->exe_counter);
		fwrite(fd, (void*)buf, VG_(strlen)(buf));
	    }

	    fprint_bbcc(fd, *p, &lastAPos);

	    p++;
	}

	close_dumpfile(buf, fd, t);
	VG_(free)(array);

	/* set counters of last dump */
	copy_fcc( &(thread[t]->last), current_fcc );
    }

    free_dump_array();

    if (current_tid != orig_tid) switch_thread(orig_tid);
}




/* Write out a new dump part.
 *
 * Increments out_counter, lets print_bbccs() write the dump and then
 * remembers the current basic block count in last_bbs_done. A trigger
 * of 0 stands for the dump at program termination. Progress messages
 * are printed with verbosity > 1.
 */
static void cachesim_flush(Char* trigger, Bool only_current_thread)
{
   VGP_PUSHCC(VgpCacheDump);

   if (VG_(clo_verbosity) > 1) {
       Char* reason = trigger;

       if (!reason)
	   reason = (Char*)"Prg.Term.";
       VG_(message)(Vg_DebugMsg, "Prepare dump at BB %llu (%s)...",
		    VG_(bbs_done), reason);
   }

   out_counter += 1;
   print_bbccs(trigger, only_current_thread);
   last_bbs_done = VG_(bbs_done);

   if (VG_(clo_verbosity) > 1) {
       VG_(message)(Vg_DebugMsg, "... finished dumping ");
   }

   VGP_POPCC(VgpCacheDump);
}

/*
 * Zero all costs of a BBCC:
 *  - every cost center stored in the BBCC's cost array,
 *  - cost sum and call counter of every jcc in its jcc list,
 *  - its execution and return counters.
 */
static void zero_bbcc(BBCC* bbcc)
{
   Addr BBCC_ptr0, BBCC_ptr;
   jCC* jcc;
   UInt array_size = bbcc->bb->array_size;

   CT_ASSERT(bbcc->cxt != 0);
   CT_DEBUG(1, "  zero_bbcc: BB 0x%x, Cxt %d "
	       "(fn '%s', rec %d)\n", 
	       bbcc->bb->addr,
	       bbcc->cxt->base_number + bbcc->rec_index,
	       bbcc->cxt->fn[0]->name,
	       bbcc->rec_index);

   /* The cost array is a packed sequence of cost centers of different
    * sizes; the tag of each one tells its layout and so how far to
    * advance. */
   BBCC_ptr0 = (Addr)(bbcc->array);
   for (BBCC_ptr = BBCC_ptr0; BBCC_ptr - BBCC_ptr0 < array_size; ) {
       iCC* head = (iCC*)BBCC_ptr;

       switch (head->tag) {

       case InstrCC:
	   init_cc( &(head->I) );
	   BBCC_ptr += sizeof(iCC);
	   break;

       /* instruction with one data access */
       case ReadCC:
       case ModCC:
       case WriteCC: {
	   idCC* one_access = (idCC*)BBCC_ptr;

	   init_cc( &one_access->I );
	   init_cc( &one_access->D );
	   BBCC_ptr += sizeof(idCC);
	   break;
       }

       /* instruction with two data accesses */
       case ReadWriteCC: {
	   iddCC* two_access = (iddCC*)BBCC_ptr;

	   init_cc( &two_access->I );
	   init_cc( &two_access->Da );
	   init_cc( &two_access->Db );
	   BBCC_ptr += sizeof(iddCC);
	   break;
       }

       default:
	   VG_(skin_panic)("Unknown CC type in zero_bbcc()\n");
	   break;
       }
   }

   /* reset all calls recorded for this BBCC */
   jcc = bbcc->jcc_list;
   while (jcc) {
       init_fcc( &(jcc->sum) );
       jcc->call_counter = 0;
       jcc = jcc->next_from;
   }

   bbcc->exe_counter = 0;
   bbcc->ret_counter = 0;

   /* the walk has to end exactly at the end of the cost array */
   CT_ASSERT(BBCC_ptr - BBCC_ptr0 == array_size);
}


/* Zero the costs collected so far.
 *
 * Done for every existing thread, or only for the thread current on
 * entry if only_current_thread is set. Per thread:
 *  - all BBCCs reachable via bbcc_table with a non-zero execution or
 *    return counter are zeroed,
 *  - for every call stack entry with a jcc, the entry's cost is set to
 *    the current cost, and the jcc's `from` BBCC is zeroed if both of
 *    its counters are zero,
 *  - the thread's `last` cost is set to the current cost.
 * The thread current on entry is switched back to at the end.
 */
static void zero_ccs(Bool only_current_thread)
{
    BBCC *bbcc, *rec_bbcc;
    int entry_tid = current_tid;
    int tid, bucket, level, frame;

    if (VG_(clo_verbosity) > 1) {
	VG_(message)(Vg_DebugMsg, "  Zeroing costs...");
    }

    for (tid = 1; tid < VG_N_THREADS; tid++) {
	if (!thread[tid]) continue;
	if (only_current_thread && (tid != entry_tid)) continue;

	switch_thread(tid);

	/* all BBCCs with costs in the hash table */
	for (bucket = 0; bucket < bbcc_table_size; bucket++) {
	    for (bbcc = bbcc_table[bucket]; bbcc; bbcc = bbcc->next1) {
		/* every bbcc should have a rec_array */
		CT_ASSERT(bbcc->rec_array != 0);

		for (level = 0;
		     level < bbcc->cxt->fn[0]->fn_recursion;
		     level++) {
		    rec_bbcc = bbcc->rec_array[level];
		    if (rec_bbcc == 0) continue;
		    if ((rec_bbcc->exe_counter>0) ||
			(rec_bbcc->ret_counter>0)) {
			zero_bbcc(rec_bbcc);
		    }
		}
	    }
	}

	/* active calls of this thread */
	for (frame = 0; frame < call_stack_sp; frame++) {
	    if (call_stack[frame].jcc) {
		bbcc = call_stack[frame].jcc->from;

		/* reset call counters to current for active calls */
		copy_fcc( &(call_stack[frame].fcc), current_fcc );

		/* non-zero counters are skipped ("already zero'd") */
		if (!((bbcc->exe_counter>0) || (bbcc->ret_counter>0))) {
		    zero_bbcc(bbcc);
		}
	    }
	}

	/* set counter for last dump */
	copy_fcc( &(thread[tid]->last), current_fcc );
    }

    if (current_tid != entry_tid) {
	switch_thread(entry_tid);
    }

    if (VG_(clo_verbosity) > 1) {
	VG_(message)(Vg_DebugMsg, "  ...done");
    }
}

/* Handle a client request addressed to this skin.
 *
 * args[0] holds the request code; requests that do not carry the
 * 'C','T' skin tag or are unknown are rejected by returning False.
 * Handled requests:
 *   DUMP_STATS      dump costs of the current thread
 *   DUMP_STATS_AT   same, with args[1] printed into the trigger text
 *   ZERO_STATS      zero costs of the current thread
 *   TOGGLE_COLLECT  invert collect_state
 * For handled requests *ret is set to 0 (meaningless) and True is
 * returned.
 */
#if VG_CORE_INTERFACE_MAJOR_VERSION < 3
Bool SK_(handle_client_request)(ThreadState* tst, UInt *args, UInt *ret)
#else
Bool SK_(handle_client_request)(ThreadId tid, UInt *args, UInt *ret)
#endif
{
   UInt request = args[0];

   if (!VG_IS_SKIN_USERREQ('C','T',request)) {
      return False;
   }

   if (request == VG_USERREQ__DUMP_STATS) {
      cachesim_flush("Client Request", True);
   }
   else if (request == VG_USERREQ__DUMP_STATS_AT) {
      Char trigger_text[512];

      VG_(sprintf)(trigger_text,"Client Request: %d", args[1]);
      cachesim_flush(trigger_text, True);
   }
   else if (request == VG_USERREQ__ZERO_STATS) {
      zero_ccs(True);
   }
   else if (request == VG_USERREQ__TOGGLE_COLLECT) {
      collect_state = !collect_state;
      CT_DEBUG(2, "User Request: toggled collection state to %s\n",
		  collect_state ? "ON" : "OFF");
   }
   else {
      /* not one of ours */
      return False;
   }

   *ret = 0;                    /* meaningless */
   return True;
}


/* Adds commas to ULong, right justifying in a field field_width wide, returns
 * the string in buf.
 *
 * n            number to print
 * field_width  minimum width of the result; shorter results are padded
 *              with spaces on the left, longer ones are not truncated
 * buf          receives the NUL-terminated result
 *
 * The return value is the length of the number including its commas,
 * but without any padding. */
static
Int commify(ULong n, int field_width, char buf[COMMIFY_BUF_LEN])
{
   int len, n_commas, i, j, new_len, space;

   VG_(sprintf)(buf, "%llu", n);
   len = VG_(strlen)(buf);
   n_commas = (len - 1) / 3;
   new_len = len + n_commas;
   space = field_width - new_len;

   /* Allow for printing a number in a field_width smaller than its size */
   if (space < 0) space = 0;    

   /* Move the digits to their final place, going from the end of the
    * string to its start so nothing is overwritten before it is read.
    * Make j = -1 because we copy the '\0' before doing the numbers in groups
    * of three. */
   for (j = -1, i = len ; i >= 0; i--) {
      buf[i + n_commas + space] = buf[i];

      /* Never put a comma in front of the leading digit: without the
       * i > 0 test a number whose digit count is a multiple of three
       * got a comma at index space-1, which is buf[-1] if space is 0. */
      if ((i > 0) && (3 == ++j)) {
         j = 0;
         n_commas--;
         buf[i + n_commas + space] = ',';
      }
   }
   /* Right justify in field. */
   for (i = 0; i < space; i++)  buf[i] = ' ';
   return new_len;
}

/* Formats the fixed point number n / ex as a percentage "<int>.<frac>%",
 * right justified in a field field_width wide, and returns the string
 * in buf.
 *
 * n            value scaled by ex (e.g. 105 with ex 100 means 1.05%)
 * ex           scale; expected to be a power of ten (the callers in this
 *              file pass 10 or 100), which gives the number of
 *              fractional digits
 * field_width  minimum width of the result; shorter results are padded
 *              with spaces on the left, longer ones are not truncated
 * buf          receives the NUL-terminated result; no bounds checking
 *              is done
 */
static
void percentify(Int n, Int ex, Int field_width, char buf[]) 
{
   int i, len, space;
   Int frac = n % ex;
   Int scale;

   VG_(sprintf)(buf, "%d.", n / ex);
   len = VG_(strlen)(buf);

   /* Pad the fractional part with leading zeros up to the number of
    * digits given by ex; otherwise 105 with ex 100 would be printed
    * as "1.5%" instead of "1.05%". */
   for (scale = ex / 10; scale > 1; scale /= 10) {
      if ((frac >= 0) && (frac < scale)) buf[len++] = '0';
   }
   VG_(sprintf)(buf + len, "%d%%", frac);

   len = VG_(strlen)(buf);
   space = field_width - len;
   if (space < 0) space = 0;     /* Allow for v. small field_width */
   i = len;

   /* Right justify in field */
   for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
   for (i = 0; i < space; i++)  buf[i] = ' ';
}

/* Final actions at program termination.
 *
 * - unwinds the signal handler state and the call stack of every
 *   existing thread,
 * - writes the final dump (cachesim_flush with trigger 0 for all
 *   threads),
 * - unlinks the info file if there is one,
 * - with verbosity > 1 prints internal statistics,
 * - with verbosity > 0 prints the instruction reference count and, if
 *   the cache simulation is on, miss counts and miss rates for the
 *   instruction, data and L2 caches.
 *
 * Note that total_fcc reference counts that are zero are set to 1
 * before being used as divisors.
 */
static
void finish()
{
   CC D_total;
   ULong L2_total_m, L2_total_mr, L2_total_mw,
         L2_total, L2_total_r, L2_total_w;
   char buf1[RESULTS_BUF_LEN], 
        buf2[RESULTS_BUF_LEN], 
        buf3[RESULTS_BUF_LEN];
   /* l1, l2, l3: widths of the three output columns, taken from the
    * printed lengths of the reference counts */
   Int l1, l2, l3;
   /* p: scale for percentify(); 100 gives two fractional digits,
    * 10 gives one */
   Int p, t;

   CT_DEBUG(0, "CallStack finalization:\n");

   /* pop all remaining items from CallStack for correct sum
    */
   for(t=1;t<VG_N_THREADS;t++) {
       if (!thread[t]) continue;
       switch_thread(t);

       /* unwind signal handlers */
       while(current_sigNum!=0)
	   post_signal(t,current_sigNum);

       /* unwind regular call stack */
       while(call_stack_sp>0) 
	   pop_call_stack();
   }
   /* No need to switch back to original thread, as prog has terminated
      if (current_tid != orig_tid) switch_thread(orig_tid);
   */


   /* final dump: no trigger, all threads */
   cachesim_flush(0, False);

   /* unlink info file */
   if (info_file) VG_(unlink)(info_file);

   if (VG_(clo_verbosity) == 0) return;

   /* Hash table stats */
   if (VG_(clo_verbosity) > 1) {
       /* NOTE(review): BB_lookups is used as divisor below without a
	* check for zero - confirm that it cannot be zero here. */
       int BB_lookups = full_debug_BBs      + fn_name_debug_BBs +
                        file_line_debug_BBs + no_debug_BBs;
      
       VG_(message)(Vg_DebugMsg, "");
       VG_(message)(Vg_DebugMsg, "Distinct objects: %d", distinct_objs);
       VG_(message)(Vg_DebugMsg, "Distinct files:   %d", distinct_files);
       VG_(message)(Vg_DebugMsg, "Distinct fns:     %d", distinct_fns);
       VG_(message)(Vg_DebugMsg, "Distinct contexts:%d", distinct_contexts);
       VG_(message)(Vg_DebugMsg, "Distinct BBs:     %d", distinct_bbs);
       VG_(message)(Vg_DebugMsg, "Distinct BBCCs:   %d", distinct_bbccs);
       VG_(message)(Vg_DebugMsg, "Distinct skips:   %d", distinct_skips);
       VG_(message)(Vg_DebugMsg, "BB lookups:       %d", BB_lookups);
       VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)", 
                    full_debug_BBs    * 100 / BB_lookups,
                    full_debug_BBs);
       VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)", 
                    file_line_debug_BBs * 100 / BB_lookups,
                    file_line_debug_BBs);
       VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)", 
                    fn_name_debug_BBs * 100 / BB_lookups,
                    fn_name_debug_BBs);
       VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)", 
                    no_debug_BBs      * 100 / BB_lookups,
                    no_debug_BBs);
       VG_(message)(Vg_DebugMsg, "BBCC Clones:       %d", bbcc_clones);
       VG_(message)(Vg_DebugMsg, "BBs Retranslated:  %d", BB_retranslations);
       VG_(message)(Vg_DebugMsg, "Distinct instrs:   %d", distinct_instrs);
       VG_(message)(Vg_DebugMsg, "");

       VG_(message)(Vg_DebugMsg, "LRU Contxt Misses: %d", cxt_lru_misses);
       VG_(message)(Vg_DebugMsg, "LRU BBCC Misses:   %d", bbcc_lru_misses);
       VG_(message)(Vg_DebugMsg, "LRU JCC Misses:    %d", jcc_lru_misses);
       VG_(message)(Vg_DebugMsg, "BBs Executed:      %llu", bb_executions);
       VG_(message)(Vg_DebugMsg, "Calls:             %llu", call_counter);
       VG_(message)(Vg_DebugMsg, "CondJMP followed:  %llu", jcnd_counter);
       VG_(message)(Vg_DebugMsg, "Boring JMPs:       %llu", jump_counter);
       VG_(message)(Vg_DebugMsg, "Recursive calls:   %llu", rec_call_counter);
       VG_(message)(Vg_DebugMsg, "Returns:           %llu", ret_counter);
#if HW_PREFETCH
       if (clo_simulate_hwpref) {
	 VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu", prefetch_up);
	 VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu", prefetch_down);
       }
#endif


       VG_(message)(Vg_DebugMsg, "");

       VG_(message)(Vg_DebugMsg, "BB hash entries:   %d (size %d, resizes %d)",
		    bb_table_entries, bb_table_size, bb_table_resizes);
       VG_(message)(Vg_DebugMsg, "BBCC hash entries: %d (size %d, resizes %d)",
		    bbcc_table_entries, bbcc_table_size, bbcc_table_resizes);
       VG_(message)(Vg_DebugMsg, "JCC hash entries:  %d (size %d, resizes %d)",
		    jcc_table_entries, jcc_table_size, jcc_table_resizes);
       VG_(message)(Vg_DebugMsg, "Cxt hash entries:  %d (size %d, resizes %d)",
		    cxt_table_entries, cxt_table_size, cxt_table_resizes);
       VG_(message)(Vg_DebugMsg, "FnInfo entries:    %d (size %d, resizes %d)",
		    fn_info_table_entries, fn_info_table_size,
		    fn_info_table_resizes);
       VG_(message)(Vg_DebugMsg, "FnActive entries:  %d (size %d, resized %d)",
		    distinct_fns, fn_active_array_size,
		    fn_active_array_resizes);
       VG_(message)(Vg_DebugMsg, "");
   }

   /* I cache results.  Use the I_refs value to determine the first column
    * width. */
   l1 = commify(total_fcc.Ir.a, 0, buf1);
   VG_(message)(Vg_UserMsg, "I   refs:      %s", buf1);

   /* everything below is cache simulation output */
   if (!clo_simulate_cache) return;

   commify(total_fcc.Ir.m1, l1, buf1);
   VG_(message)(Vg_UserMsg, "I1  misses:    %s", buf1);

   commify(total_fcc.Ir.m2, l1, buf1);
   VG_(message)(Vg_UserMsg, "L2i misses:    %s", buf1);

   p = 100;

   /* avoid division by zero in the miss rates */
   if (0 == total_fcc.Ir.a) total_fcc.Ir.a = 1;
   percentify(total_fcc.Ir.m1 * 100 * p / total_fcc.Ir.a, p, l1+1, buf1);
   VG_(message)(Vg_UserMsg, "I1  miss rate: %s", buf1);
       
   percentify(total_fcc.Ir.m2 * 100 * p / total_fcc.Ir.a, p, l1+1, buf1);
   VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
   VG_(message)(Vg_UserMsg, "");
   
   /* D cache results.  Use the D_refs.rd and D_refs.wr values to determine the
    * width of columns 2 & 3. */

   /* D_total = data reads + data writes */
   copy_cc( &D_total, &(total_fcc.Dr) );
   add_cc( &D_total, &(total_fcc.Dw) );    

        commify( D_total.a, l1, buf1);
   l2 = commify(total_fcc.Dr.a, 0,  buf2);
   l3 = commify(total_fcc.Dw.a, 0,  buf3);
   VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)",
                buf1,  buf2,  buf3);

   commify( D_total.m1, l1, buf1);
   commify(total_fcc.Dr.m1, l2, buf2);
   commify(total_fcc.Dw.m1, l3, buf3);
   VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)",
                buf1, buf2, buf3);

   commify( D_total.m2, l1, buf1);
   commify(total_fcc.Dr.m2, l2, buf2);
   commify(total_fcc.Dw.m2, l3, buf3);
   VG_(message)(Vg_UserMsg, "L2d misses:    %s  (%s rd + %s wr)",
                buf1, buf2, buf3);

   p = 10;
   
   /* avoid division by zero in the miss rates */
   if (0 == D_total.a)   D_total.a = 1;
   if (0 == total_fcc.Dr.a) total_fcc.Dr.a = 1;
   if (0 == total_fcc.Dw.a) total_fcc.Dw.a = 1;
   percentify( D_total.m1 * 100 * p / D_total.a,  p, l1+1, buf1);
   percentify(total_fcc.Dr.m1 * 100 * p / total_fcc.Dr.a, p, l2+1, buf2);
   percentify(total_fcc.Dw.m1 * 100 * p / total_fcc.Dw.a, p, l3+1, buf3);
   VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )", buf1, buf2,buf3);

   percentify( D_total.m2 * 100 * p / D_total.a,  p, l1+1, buf1);
   percentify(total_fcc.Dr.m2 * 100 * p / total_fcc.Dr.a, p, l2+1, buf2);
   percentify(total_fcc.Dw.m2 * 100 * p / total_fcc.Dw.a, p, l3+1, buf3);
   VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s   + %s  )", buf1, buf2,buf3);
   VG_(message)(Vg_UserMsg, "");

   /* L2 overall results */

   /* L2 references are the misses of the first level caches;
    * instruction misses are counted as reads */
   L2_total   = total_fcc.Dr.m1 + total_fcc.Dw.m1 + total_fcc.Ir.m1;
   L2_total_r = total_fcc.Dr.m1 + total_fcc.Ir.m1;
   L2_total_w = total_fcc.Dw.m1;
   commify(L2_total,   l1, buf1);
   commify(L2_total_r, l2, buf2);
   commify(L2_total_w, l3, buf3);
   VG_(message)(Vg_UserMsg, "L2 refs:       %s  (%s rd + %s wr)",
                buf1, buf2, buf3);

   L2_total_m  = total_fcc.Dr.m2 + total_fcc.Dw.m2 + total_fcc.Ir.m2;
   L2_total_mr = total_fcc.Dr.m2 + total_fcc.Ir.m2;
   L2_total_mw = total_fcc.Dw.m2;
   commify(L2_total_m,  l1, buf1);
   commify(L2_total_mr, l2, buf2);
   commify(L2_total_mw, l3, buf3);
   VG_(message)(Vg_UserMsg, "L2 misses:     %s  (%s rd + %s wr)",
                buf1, buf2, buf3);

   /* L2 miss rates are relative to all instruction/data references,
    * not to the L2 references */
   percentify(L2_total_m  * 100 * p / (total_fcc.Ir.a + D_total.a),  p, l1+1, buf1);
   percentify(L2_total_mr * 100 * p / (total_fcc.Ir.a + total_fcc.Dr.a), p, l2+1, buf2);
   percentify(L2_total_mw * 100 * p / total_fcc.Dw.a, p, l3+1, buf3);
   VG_(message)(Vg_UserMsg, "L2 miss rate:  %s (%s   + %s  )", buf1, buf2,buf3);

}


/* Skin finalization entry point: all work is done by finish().
 * The exit code is not used. */
void SK_(fini)(Int exitcode)
{
  finish();
}

/* Called when a translation is invalidated due to self-modifying code or
 * unloading of a shared object.
 *
 * Looks up the BBCC for address a via get_bbcc() with the remove flag
 * set, and asserts that the BB has been seen before. The BBCC itself is
 * neither freed nor added to any discard counter here (see the comment
 * at the end of the function). The size argument is only used by the
 * disabled debug output. */
void SK_(discard_basic_block_info) ( Addr a, UInt size )
{
   BBCC *BBCC_node;
   Bool BB_seen_before;
    
   /* debug output, disabled */
   if (0)
      VG_(printf)( "discard_basic_block_info: addr %x, size %u\n", a, size);

   /* 2nd arg won't be used since BB should have been seen before (assertions
    * ensure this). */
   BBCC_node = get_bbcc(a, NULL, /*remove=*/True, &BB_seen_before);
   CT_ASSERT(True == BB_seen_before);

   /* We don't free the BBCCs here: JCCs could reference it. (TODO/FIXME)
    * cost can be dumped later, no need for special discards events
   add_BBCC(BBCC_node, &discards_fcc);
   VG_(free)(BBCC_node);
   */
}

/*--------------------------------------------------------------------*/
/*--- Command line processing                                      ---*/
/*--------------------------------------------------------------------*/

/* Parse a cache configuration option into *cache.
 *
 * orig_opt is the complete option, e.g. "--I1=65536,2,64"; opt_len is
 * the index of the first character after the "--XX=" prefix. The three
 * comma separated numbers are size, associativity and line size.
 * Anything else is reported with VG_(bad_option)().
 */
static void parse_cache_opt ( cache_t* cache, char* orig_opt, int opt_len )
{
   /* work on a copy: the commas get replaced by NULs so that every
    * number becomes a string of its own */
   char *copy = VG_(strdup)(orig_opt);
   int   field[3];      /* start index of each number */
   int   pos = opt_len;
   int   k;

   field[0] = pos;

   /* the first two numbers have to be followed by a comma */
   for (k = 1; k < 3; k++) {
      while (VG_(isdigit)(copy[pos])) pos++;
      if (copy[pos] != ',') goto bad;
      copy[pos++] = '\0';
      field[k] = pos;
   }

   /* the last number has to end the option */
   while (VG_(isdigit)(copy[pos])) pos++;
   if (copy[pos] != '\0') goto bad;

   cache->size      = (Int)VG_(atoll)(copy + field[0]);
   cache->assoc     = (Int)VG_(atoll)(copy + field[1]);
   cache->line_size = (Int)VG_(atoll)(copy + field[2]);

   VG_(free)(copy);

   return;

  bad:
   VG_(bad_option)(orig_opt);
}

/* Read a decimal number from the start of s.
 *
 * The value of the leading digits (0 if there are none) is stored in
 * *pn unless pn is null. Returns a pointer to the first character
 * which is not a digit. */
static Char* getUInt(Char* s, UInt* pn)
{
    UInt value;
    Char* cur;

    for (value = 0, cur = s; ('0' <= *cur) && (*cur <= '9'); cur++)
	value = 10*value + (*cur - '0');

    if (pn != 0)
	*pn = value;

    return cur;
}

Bool SK_(process_cmd_line_option)(Char* arg)
{
   if (0 == VG_(strcmp)(arg, "--skip-plt=yes"))
       clo_skip_plt = True;
   else if (0 == VG_(strcmp)(arg, "--skip-plt=no"))
       clo_skip_plt = False;

   else if (0 == VG_(strcmp)(arg, "--trace-jump=yes"))
       clo_trace_jump = True;
   else if (0 == VG_(strcmp)(arg, "--trace-jump=no"))
       clo_trace_jump = False;

   else if (0 == VG_(strcmp)(arg, "--separate-dumps=yes"))
       clo_separate_dumps = True;
   else if (0 == VG_(strcmp)(arg, "--separate-dumps=no"))
       clo_separate_dumps = False;

   else if (0 == VG_(strcmp)(arg, "--simulate-cache=yes"))
       clo_simulate_cache = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-cache=no"))
       clo_simulate_cache = False;

   else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=yes"))
       clo_simulate_hwpref = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=no"))
       clo_simulate_hwpref = False;

   else if (0 == VG_(strcmp)(arg, "--collect-state=yes"))
       clo_collect_state = True;
   else if (0 == VG_(strcmp)(arg, "--collect-state=no"))
       clo_collect_state = False;

   else if (0 == VG_(strcmp)(arg, "--dump-threads=yes"))
       clo_dump_threads = True;
   else if (0 == VG_(strcmp)(arg, "--dump-threads=no"))
       clo_dump_threads = False;

   else if (0 == VG_(strcmp)(arg, "--compress-strings=yes"))
       clo_compress_strings = True;
   else if (0 == VG_(strcmp)(arg, "--compress-strings=no"))
       clo_compress_strings = False;

   else if (0 == VG_(strcmp)(arg, "--compress-mangled=yes"))
       clo_compress_mangled = True;
   else if (0 == VG_(strcmp)(arg, "--compress-mangled=no"))
       clo_compress_mangled = False;

   else if (0 == VG_(strcmp)(arg, "--compress-events=yes"))
       clo_compress_events = True;
   else if (0 == VG_(strcmp)(arg, "--compress-events=no"))
       clo_compress_events = False;

   else if (0 == VG_(strncmp)(arg, "--fn-skip=", 10)) {
       fn_config* fnc = get_fnc(arg+10);
       fnc->skip = CONFIG_TRUE;
   }

   else if (0 == VG_(strncmp)(arg, "--dump-before=", 14)) {
       fn_config* fnc = get_fnc(arg+14);
       fnc->dump_before = CONFIG_TRUE;
   }

   else if (0 == VG_(strncmp)(arg, "--zero-before=", 14)) {
       fn_config* fnc = get_fnc(arg+14);
       fnc->zero_before = CONFIG_TRUE;
   }

   else if (0 == VG_(strncmp)(arg, "--dump-after=", 13)) {
       fn_config* fnc = get_fnc(arg+13);
       fnc->dump_after = CONFIG_TRUE;
   }

   else if (0 == VG_(strncmp)(arg, "--toggle-collect=", 17)) {
       fn_config* fnc = get_fnc(arg+17);
       fnc->toggle_collect = CONFIG_TRUE;
       /* defaults to initial collection off */
       clo_collect_state = False;
   }

   else if (0 == VG_(strncmp)(arg, "--fn-recursion=", 15))
        clo_fn_recursion = (Int)VG_(atoll)(&arg[15]);

#if JCC_DEBUG
   else if (0 == VG_(strncmp)(arg, "--ct-verbose=", 13))
        clo_ct_verbose = (Int)VG_(atoll)(&arg[13]);

   else if (0 == VG_(strncmp)(arg, "--ct-vstart=", 12))
        clo_ct_verbose_start = (ULong)VG_(atoll)(&arg[12]);

   else if (0 == VG_(strncmp)(arg, "--ct-verbose", 12)) {
       UInt n;
       fn_config* fnc;
       Char* s = getUInt(arg+12, &n);
       if ((n == 0) || *s != '=') return False;
       fnc = get_fnc(s+1);
       fnc->verbosity = n;
   }
#endif

   else if (0 == VG_(strncmp)(arg, "--fn-caller=", 12))
       clo_fn_caller = (Int)VG_(atoll)(&arg[12]);

   else if (0 == VG_(strncmp)(arg, "--fn-group", 10)) {
       UInt n;
       fn_config* fnc;
       Char* s = getUInt(arg+10, &n);
       if ((n == 0) || *s != '=') return False;
       fnc = get_fnc(s+1);
       fnc->group = n;
   }

   else if (0 == VG_(strncmp)(arg, "--fn-caller", 11)) {
       UInt n;
       fn_config* fnc;
       Char* s = getUInt(arg+11, &n);
       if ((n == 0) || *s != '=') return False;
       fnc = get_fnc(s+1);
       fnc->fn_caller = n;
   }

   else if (0 == VG_(strncmp)(arg, "--fn-recursion", 14)) {
       UInt n;
       fn_config* fnc;
       Char* s = getUInt(arg+14, &n);
       if ((n == 0) || *s != '=') return False;
       fnc = get_fnc(s+1);
       fnc->fn_recursion = n;
   }

   else if (0 == VG_(strncmp)(arg, "--base=", 7))
       clo_filename_base = VG_(strdup)(arg+7);

   else if (0 == VG_(strcmp)(arg, "--mangle-names=yes"))
       clo_mangle_names = True;
   else if (0 == VG_(strcmp)(arg, "--mangle-names=no"))
       clo_mangle_names = False;

   else if (0 == VG_(strcmp)(arg, "--skip-direct-rec=yes"))
       clo_skip_direct_recursion = True;
   else if (0 == VG_(strcmp)(arg, "--skip-direct-rec=no"))
       clo_skip_direct_recursion = False;

   else if (0 == VG_(strcmp)(arg, "--dump-bbs=yes"))
       clo_dump_bbs = True;
   else if (0 == VG_(strcmp)(arg, "--dump-bbs=no"))
       clo_dump_bbs = False;

   else if (0 == VG_(strcmp)(arg, "--dump-skipped=yes"))
       clo_dump_skipped = True;
   else if (0 == VG_(strcmp)(arg, "--dump-skipped=no"))
       clo_dump_skipped = False;

   else if (0 == VG_(strcmp)(arg, "--dump-line=yes"))
       clo_dump_line = True;
   else if (0 == VG_(strcmp)(arg, "--dump-line=no"))
       clo_dump_line = False;

   else if (0 == VG_(strcmp)(arg, "--dump-instr=yes"))
       clo_dump_instr = True;
   else if (0 == VG_(strcmp)(arg, "--dump-instr=no"))
       clo_dump_instr = False;

   else if (0 == VG_(strcmp)(arg, "--dump-bb=yes"))
       clo_dump_bb = True;
   else if (0 == VG_(strcmp)(arg, "--dump-bb=no"))
       clo_dump_bb = False;

   else if (0 == VG_(strcmp)(arg, "--compress-pos=yes"))
       clo_compress_pos = True;
   else if (0 == VG_(strcmp)(arg, "--compress-pos=no"))
       clo_compress_pos = False;

   else if (0 == VG_(strncmp)(arg, "--dumps=", 8))
        clo_dumps = (Int)VG_(atoll)(&arg[8]);

   /* 5 is length of "--I1=" */
   else if (0 == VG_(strncmp)(arg, "--I1=", 5))
      parse_cache_opt(&clo_I1_cache, arg,   5);
   else if (0 == VG_(strncmp)(arg, "--D1=", 5))
      parse_cache_opt(&clo_D1_cache, arg,   5);
   else if (0 == VG_(strncmp)(arg, "--L2=", 5))
      parse_cache_opt(&clo_L2_cache, arg,   5);
   else
      return False;

   return True;
}

/* Print the user-visible command line options of this skin.
 *
 * The value in square brackets is the default of each option; it has to
 * agree with the initial values assigned in SK_(pre_clo_init).
 * --skip-direct-rec is initialised to False there, so it is listed with
 * default [no] here.
 */
void SK_(print_usage)(void)
{
   VG_(printf)(

"    --base=<prefix>           Prefix for profile files [" DEFAULT_DUMPNAME "]\n"
"    --separate-dumps=yes|no   Write each dump into separate file [yes]\n"
"    --simulate-cache=yes|no   Full cache simulation [yes]\n"
"    --simulate-hwpref=yes|no  Simulate Hardware Prefetch [no]\n"
"    --collect-state=yes|no    Start with collecting events [yes]\n"
"    --skip-plt=no|yes         Ignore calls to/from PLT sections? [yes]\n"
"    --trace-jump=no|yes       Trace (conditional) jumps in functions? [no]\n"
"    --fn-skip=<function>      Ignore calls to/from function?\n"
"    --fn-group<no>=<func>     Put function into separation group <no>\n"
"    --fn-recursion<no>=<func> Separate <no> recursions for function\n"
"    --fn-caller<no>=<func>    Separate <no> callers for function\n"
"    --dump-before=<func>      Dump when entering function\n"
"    --zero-before=<func>      Zero all costs when entering function\n"
"    --dump-after=<func>       Dump when leaving function\n"
"    --toggle-collect=<func>   Toggle collection on enter/leave function\n"
"    --fn-recursion=<level>    Separate function recursions, max level [2]\n"
"    --skip-direct-rec=no|yes  Ignore direct recursions? [no]\n"
"    --fn-caller=<callers>     Separate functions by callers [0]\n"
"    --mangle-names=no|yes     Mangle separation into names? [yes]\n"
"    --dump-threads=no|yes     Dump traces per thread? [no]\n"
"    --compress-strings=no|yes Compress strings in profile dump? [yes]\n"
"    --compress-events=no|yes  Compress events in profile dump? [no]\n"
"    --compress-pos=no|yes     Compress positions in profile dump? [no]\n"
"    --dump-line=no|yes        Dump source lines of costs? [yes]\n"
"    --dump-instr=no|yes       Dump instruction address of costs? [no]\n"
"    --dump-bb=no|yes          Dump basic block address of costs? [no]\n"
"    --dump-bbs=no|yes         Dump basic block info? [no]\n"
"    --dump-skipped=no|yes     Dump info on skipped functions in calls? [no]\n"
"    --dumps=<count>           Dump trace each <count> BBs [0=never]\n"
"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
"    --L2=<size>,<assoc>,<line_size>  set L2 cache manually\n"
"\n  see " VG_LIBDIR"/../share/doc/valgrind/ct_main.html for full documentation.\n\n"

    );
}

/* Print the debugging options of this skin.
 *
 * The verbosity options only exist when the skin is compiled with
 * JCC_DEBUG set; otherwise "(none)" is printed.
 */
void SK_(print_debug_usage)(void)
{
#if JCC_DEBUG
    VG_(printf)(
"    --ct-verbose=<level>       Verbosity of standard debug output [0]\n"
"    --ct-vstart=<BB number>    Only be verbose after basic block [0]\n"
"    --ct-verbose<level>=<func> Verbosity while in <func>\n"
    );
#else
    VG_(printf)(
"    (none)\n"
    );
#endif
}

/*--------------------------------------------------------------------*/
/*--- Setup                                                        ---*/
/*--------------------------------------------------------------------*/

void SK_(pre_clo_init)(void)
{
    VG_(details_name)            ("Calltree");
    VG_(details_version)         (VERSION);
    VG_(details_description)     ("a call-graph generating cache profiler");
    VG_(details_copyright_author)("Copyright (C) 2002-2004, and GNU GPL'd, "
				  "by N.Nethercote and J.Weidendorfer.");
    VG_(details_bug_reports_to)  ("Josef.Weidendorfer@gmx.de");
    VG_(details_avg_translation_sizeB) ( 155 );

    VG_(needs_basic_block_discards)();
    VG_(needs_command_line_options)();
    VG_(needs_client_requests)();

#if VG_CORE_INTERFACE_MAJOR_VERSION > 4
    VG_(init_thread_run) ( &thread_run );
    VG_(init_pre_deliver_signal)  ( &pre_signal );
    VG_(init_post_deliver_signal)  ( &post_signal );
#else
    VG_(track_thread_run) ( &thread_run );
    VG_(track_pre_deliver_signal)  ( &pre_signal );
    VG_(track_post_deliver_signal)  ( &post_signal );
#endif

    VG_(register_compact_helper)((Addr) & log_1I_0D_cache_access);
    VG_(register_compact_helper)((Addr) & log_1I_0D_cache_access_JIFZ);
    VG_(register_compact_helper)((Addr) & log_0I_1D_cache_access);
    VG_(register_compact_helper)((Addr) & log_1I_1D_cache_access);
    VG_(register_compact_helper)((Addr) & log_0I_2D_cache_access);
    VG_(register_compact_helper)((Addr) & log_1I_2D_cache_access);

    VG_(register_compact_helper)((Addr) & setup_bbcc);

    clo_separate_dumps   = True;
    clo_dump_threads     = False;
    clo_simulate_cache   = True;
    clo_simulate_hwpref  = False;
    clo_collect_state    = True;
    clo_mangle_names     = True;
    clo_skip_direct_recursion = False;
    clo_dumps            = 0;
    clo_fn_caller        = 0;
    clo_fn_recursion     = 2;
    clo_skip_plt         = True;
    clo_trace_jump       = False;

    /* dump options */
    clo_compress_strings = True;
    clo_compress_mangled = False;
    clo_compress_events  = False;
    clo_compress_pos     = False;
    clo_dump_line        = True;
    clo_dump_instr       = False;
    clo_dump_bb          = False;
    clo_dump_bbs         = False;
    clo_dump_skipped     = False;
}

void SK_(post_clo_init)(void)
{
   cache_t I1c, D1c, L2c; 
   Int fd, size;

   CT_DEBUG(1, "  dump threads: %s\n", clo_dump_threads ? "Yes":"No");
   CT_DEBUG(1, "  call sep. : %d\n", clo_fn_caller);
   CT_DEBUG(1, "  rec. sep. : %d\n", clo_fn_recursion);

   if (!clo_dump_line && !clo_dump_instr && !clo_dump_bb) {
       VG_(message)(Vg_UserMsg, "Using source line as position.");
       clo_dump_line = True;
   }

   if (!clo_filename_base) clo_filename_base = DEFAULT_DUMPNAME;

   /* get base directory for dump/command/result files */
   if (clo_filename_base[0] == '/') {
       int lastSlash = 0, i =1;
       while(clo_filename_base[i]) {
	   for(;clo_filename_base[i] && clo_filename_base[i] != '/';i++);
	   if (clo_filename_base[i] != '/') break;
	   lastSlash = i;
       }
       base_directory = VG_(malloc)(i+1);
       VG_(strncpy)(base_directory, clo_filename_base, i);
       base_directory[i] = 0;

       dump_file_base = clo_filename_base;
   }
   else {
       size = 100;
       base_directory = 0;

       /* getcwd() fails if the buffer isn't big enough -- keep doubling size
	  until it succeeds. */
       while (NULL == base_directory) {
	   base_directory = VG_(malloc)(size);
	   if (NULL == VG_(getcwd)(base_directory, size)) {
	       VG_(free)(base_directory);
	       base_directory = 0;
	       size *= 2;
	   }
       }

       size = VG_(strlen)(base_directory) + VG_(strlen)(clo_filename_base) +2;
       dump_file_base = VG_(malloc)(size);
       CT_ASSERT(dump_file_base != 0);
       VG_(sprintf)(dump_file_base, "%s/%s",
		    base_directory, clo_filename_base);
   }

   size = VG_(strlen)(base_directory) + VG_(strlen)(DEFAULT_COMMANDNAME) +10;
   command_file = VG_(malloc)(size);
   CT_ASSERT(command_file != 0);
   VG_(sprintf)(command_file, "%s/%s.%d",
		base_directory, DEFAULT_COMMANDNAME, VG_(getpid)());

   /* This is for compatibility with the "Force Now" Button of current
    * KCachegrind releases, as it doesn't use ".pid" to distinguish
    * different calltree instances from same base directory.
    * Should be removed sometimes in the future (29.10.03)
    */
   command_file2 = VG_(malloc)(size);
   CT_ASSERT(command_file2 != 0);
   VG_(sprintf)(command_file2, "%s/%s",
		base_directory, DEFAULT_COMMANDNAME);

   size = VG_(strlen)(base_directory) + VG_(strlen)(DEFAULT_RESULTNAME) +10;
   result_file = VG_(malloc)(size);
   CT_ASSERT(result_file != 0);
   VG_(sprintf)(result_file, "%s/%s.%d",
		base_directory, DEFAULT_RESULTNAME, VG_(getpid)());

   info_file = VG_(malloc)(VG_(strlen)(DEFAULT_INFONAME) + 10);
   CT_ASSERT(info_file != 0);
   VG_(sprintf)(info_file, "%s.%d", DEFAULT_INFONAME, VG_(getpid)());

   CT_DEBUG(1, "  dump file base: '%s'\n", dump_file_base);
   CT_DEBUG(1, "  command file:   '%s'\n", command_file);
   CT_DEBUG(1, "  result file:    '%s'\n", result_file);
   CT_DEBUG(1, "  info file:      '%s'\n", info_file);

   /* allocate space big enough for final filenames */
   filename = VG_(malloc)(VG_(strlen)(dump_file_base)+32);
   CT_ASSERT(filename != 0);
       
   /* Make sure the output base file can be written.
    * This is used for the dump at program termination.
    * We stop with an error here if we can't create the
    * file: This is probably because of missing rights,
    * and trace parts wouldn't be allowed to be written, too.
    */ 
    VG_(sprintf)(filename, "%s.%d", dump_file_base, VG_(getpid)());
    fd = VG_(open)(filename, VKI_O_WRONLY|VKI_O_TRUNC, 0);
    if (fd <0) { 
	fd = VG_(open)(filename, VKI_O_CREAT|VKI_O_WRONLY,
		       VKI_S_IRUSR|VKI_S_IWUSR);
	if (fd <0) {
	    file_err(); 
	}
    }
    if (fd>=0) VG_(close)(fd);

    /* create info file to indicate that we are running */ 
    fd = VG_(open)(info_file, VKI_O_WRONLY|VKI_O_TRUNC, 0);
    if (fd<0) { 
	fd = VG_(open)(info_file, VKI_O_CREAT|VKI_O_WRONLY,
		       VKI_S_IRUSR|VKI_S_IWUSR|
		       VKI_S_IRGRP|VKI_S_IWGRP|
		       VKI_S_IROTH|VKI_S_IWOTH);
	if (fd <0) {
	  VG_(message)(Vg_DebugMsg, 
		       "warning: can't write info file '%s'", info_file);
	  info_file = 0;
	  fd = -1;
	}
    }
    if (fd>=0) {
      Char buf[512];
      Int i;

      VG_(sprintf)(buf, 
		   "# This file is generated by Calltree-" VERSION ".\n"
		   "# It is used to enable controlling the supervision of\n"
		   "#  '%s'\n"
		   "# by external tools.\n\n", VG_(client_argv[0]));
      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));

      VG_(sprintf)(buf, "version: " VERSION "\n");
      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
    
      VG_(sprintf)(buf, "base: %s\n", base_directory);
      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));

      VG_(sprintf)(buf, "dumps: %s\n", filename);
      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));

      VG_(sprintf)(buf, "control: %s\n", command_file);
      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));

      VG_(sprintf)(buf, "result: %s\n", result_file);
      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
    
      VG_(strcpy)(buf, "cmd:");
      VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
      for (i = 0; i < VG_(client_argc); i++) {
	VG_(sprintf)(buf, " %s", VG_(client_argv[i]));
	VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
      }
      VG_(write)(fd, "\n", 1);
      VG_(close)(fd);
    }

   get_caches(&I1c, &D1c, &L2c);

   cachesim_I1_initcache(I1c);
   cachesim_D1_initcache(D1c);
   cachesim_L2_initcache(L2c);
#if HW_PREFETCH
   prefetch_init();
#endif

   VGP_(register_profile_event)(VgpCacheGetBBCC,  "cache-getBBCC");
   VGP_(register_profile_event)(VgpCacheDump,     "cache-dump");
   VGP_(register_profile_event)(VgpCacheSimulate, "cache-simulate");
   VGP_(register_profile_event)(VgpCacheResults,  "cache-results");
   VGP_(register_profile_event)(VgpCacheCallDepth,"cache-calldepth");
   VGP_(register_profile_event)(VgpCacheCalltree, "cache-calltree");
   VGP_(register_profile_event)(VgpCacheSetup,    "cache-setup");
   

   if (clo_simulate_cache) {       
       costtype_register("Ir",   True);
       costtype_register("I1mr", False);
       costtype_register("I2mr", False);
       costtype_register("Dr",   True);
       costtype_register("D1mr", False);
       costtype_register("D2mr", False);
       costtype_register("Dw",   True);
       costtype_register("D1mw", False);
       costtype_register("D2mw", False);
   }
   else
       costtype_register("Ir", False);

   init_fcc( &total_fcc );

   /* initialize hash tables */
   init_obj_table();
   init_cxt_table();
   init_bb_table();

   init_threads();   

   thread_run(1);
}

/* Sanity check hooks. They are compiled out by the "#if 0" below and
 * only kept for reference. */
#if 0
/* Cheap check: unconditionally reports success. */
Bool SK_(cheap_sanity_check)(void) { return True; }

extern TTEntry* vg_tt;

/* Expensive check: walks 200191 entries of vg_tt and, for every entry
 * whose orig_addr is neither 1 nor 3, prints a '.' and calls get_BBCC()
 * for that address with remove=True. Always returns True.
 * NOTE(review): 1 and 3 are presumably markers for unused/deleted
 * entries and 200191 the size of the table -- confirm against the
 * core before re-enabling this code.
 */
Bool SK_(expensive_sanity_check)(void)
{ 
   Int i;
   Bool dummy;   /* output argument of get_BBCC(), value not used here */
   for (i = 0; i < 200191; i++) {
      if (vg_tt[i].orig_addr != (Addr)1 &&
          vg_tt[i].orig_addr != (Addr)3) {
         VG_(printf)(".");
         get_BBCC(vg_tt[i].orig_addr, NULL, /*remove=*/True, &dummy);
      }
   }
   return True;
}
#endif

/*--------------------------------------------------------------------*/
/*--- end                                                ct_main.c ---*/
/*--------------------------------------------------------------------*/
