/**************************************************************************/
/*                                                                        */
/*                                 OCaml                                  */
/*                                                                        */
/*          Jacques-Henri Jourdan, projet Gallium, INRIA Paris            */
/*                                                                        */
/*   Copyright 2016 Institut National de Recherche en Informatique et     */
/*     en Automatique.                                                    */
/*                                                                        */
/*   All rights reserved.  This file is distributed under the terms of    */
/*   the GNU Lesser General Public License version 2.1, with the          */
/*   special exception on linking described in the file LICENSE.          */
/*                                                                        */
/**************************************************************************/

#define CAML_INTERNALS

#include <stdbool.h>

#include "caml/memory.h"
#include "caml/memprof.h"

/* Type aliases for the hierarchy of structures for managing memprof
   status. */

typedef struct memprof_domain_s memprof_domain_s, *memprof_domain_t;
typedef struct memprof_thread_s memprof_thread_s, *memprof_thread_t;

/* Per-thread memprof state. */

struct memprof_thread_s {
  /* [suspended] is used for inhibiting memprof callbacks when a
     callback is running or when an uncaught exception handler is
     called. */
  bool suspended;

  /* TODO: More fields to add here */

  /* Per-domain memprof information */
  memprof_domain_t domain;

  /* Linked list of thread structures for this domain. Could use a
     doubly-linked list for performance, but I haven't measured it. */
  memprof_thread_t next;
};

/* A memprof configuration is held in an object on the Caml heap.
   These are getter macros for each field. */

#define Stopped(config)          Bool_val(Field(config, 0))
#define Running(config)          (((config) != Val_unit) && !Stopped(config))
#define Lambda(config)           Double_val(Field(config, 1))
#define One_log1m_lambda(config) Double_val(Field(config, 2))
#define Callstack_size(config)   Int_val(Field(config, 3))
#define Alloc_minor(config)      Field(config, 4)
#define Alloc_major(config)      Field(config, 5)
#define Promote(config)          Field(config, 6)
#define Dealloc_minor(config)    Field(config, 7)
#define Dealloc_major(config)    Field(config, 8)

/* The 'stopped' field is the only one we ever update. */

#define Set_stopped(config, flag) (Field(config, 0) = Val_bool(flag))
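
/* Illustrative only: how the getters above compose. A configuration
   is "running" when it exists (is not [Val_unit]) and has not been
   stopped; stopping is a single field update. [example_stop] is a
   hypothetical helper, not part of this file, hence not compiled: */
#if 0
static void example_stop(value config)
{
  if (Running(config)) {
    Set_stopped(config, true);
    CAMLassert(Stopped(config) && !Running(config));
  }
}
#endif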

/* Per-domain memprof state */

struct memprof_domain_s {
  /* The owning domain */
  caml_domain_state *caml_state;

  /* Linked list of threads in this domain */
  memprof_thread_t threads;

  /* The current thread's memprof state. Note that there may not be a
     "current thread". TODO: maybe this shouldn't be nullable.
     Nullability costs us some effort and may be meaningless. See call
     site of caml_memprof_leave_thread() in st_stubs.c. */
  memprof_thread_t current;

  /* TODO: More fields to add here */

  /* The current profiling configuration for this domain. */
  value config;
};

/**** Create and destroy thread state structures ****/

static memprof_thread_t thread_create(memprof_domain_t domain)
{
  memprof_thread_t thread = caml_stat_alloc_noexc(sizeof(memprof_thread_s));
  if (!thread) {
    return NULL;
  }
  thread->suspended = false;

  /* attach to domain record */
  thread->domain = domain;
  thread->next = domain->threads;
  domain->threads = thread;

  return thread;
}

static void thread_destroy(memprof_thread_t thread)
{
  memprof_domain_t domain = thread->domain;

  if (domain->current == thread) {
    domain->current = NULL;
  }
  /* Remove thread from the per-domain list. Could go faster if we
     used a doubly-linked list, but that's premature optimisation at
     this point. */
  memprof_thread_t *p = &domain->threads;
  while (*p != thread) {
    p = &(*p)->next;
  }
  *p = thread->next;

  caml_stat_free(thread);
}

/**** Create and destroy domain state structures ****/

static void domain_destroy(memprof_domain_t domain)
{
  memprof_thread_t thread = domain->threads;
  while (thread) {
    memprof_thread_t next = thread->next;
    thread_destroy(thread);
    thread = next;
  }

  caml_stat_free(domain);
}

static memprof_domain_t domain_create(caml_domain_state *caml_state)
{
  memprof_domain_t domain = caml_stat_alloc_noexc(sizeof(memprof_domain_s));
  if (!domain) {
    return NULL;
  }

  domain->caml_state = caml_state;
  domain->threads = NULL;
  domain->current = NULL;
  domain->config = Val_unit;

  /* create initial thread for domain */
  memprof_thread_t thread = thread_create(domain);
  if (thread) {
    domain->current = thread;
  } else {
    domain_destroy(domain);
    domain = NULL;
  }
  return domain;
}

/**** Interface to domain module ***/

void caml_memprof_new_domain(caml_domain_state *parent,
                             caml_domain_state *child)
{
  memprof_domain_t domain = domain_create(child);

  child->memprof = domain;

  /* domain inherits configuration from parent */
  if (domain && parent) {
    domain->config = parent->memprof->config;
  }
}

void caml_memprof_delete_domain(caml_domain_state *state)
{
  if (!state->memprof) {
    return;
  }
  domain_destroy(state->memprof);
  state->memprof = NULL;
}

/**** Interface with domain action-pending flag ****/

/* If profiling is active in the current domain, and we may have some
   callbacks pending, set the action pending flag. */

static void set_action_pending_as_needed(memprof_domain_t domain)
{
  /* if (condition) caml_set_action_pending(domain->caml_state); */
}

/* Set the suspended flag on [domain] to [s]. */

static void update_suspended(memprof_domain_t domain, bool s)
{
  if (domain->current) {
    domain->current->suspended = s;
  }
  caml_memprof_renew_minor_sample(domain->caml_state);
  if (!s) set_action_pending_as_needed(domain);
}

/* Set the suspended flag on the current domain to [s]. */

void caml_memprof_update_suspended(bool s)
{
  update_suspended(Caml_state->memprof, s);
}

/**** Sampling procedures ****/

Caml_inline bool running(memprof_domain_t domain)
{
  memprof_thread_t thread = domain->current;

  if (thread && !thread->suspended) {
    value config = domain->config;
    return Running(config);
  }
  return false;
}

/* Renew the next sample in a domain's minor heap. Could race with
   sampling and profile-stopping code, so do not call from another
   domain unless the world is stopped. Must be called after each minor
   sample and after each minor collection. In practice, this is called
   at each minor sample, at each minor collection, and when sampling
   is suspended and unsuspended. Extra calls do not change the
   statistical properties of the sampling because of the
   memorylessness of the geometric distribution. */

void caml_memprof_renew_minor_sample(caml_domain_state *state)
{
  memprof_domain_t domain = state->memprof;
  value *trigger = state->young_start;
  if (running(domain)) {
    /* set trigger based on geometric distribution */
  }
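  /* A minimal sketch of the missing trigger computation, assuming a
     geometric generator in the style of [rand_geom] from the archived
     single-domain code at the end of this file (hypothetical in this
     stub, hence not compiled): the next sample sits one geometric
     variate below the allocation pointer, clamped to the start of the
     minor heap. */
#if 0
  if (running(domain)) {
    uintnat dist = rand_geom() - 1;  /* words until the next sample */
    trigger = ((uintnat)(state->young_ptr - state->young_start) < dist)
                ? state->young_start  /* no sample in this minor heap */
                : state->young_ptr - dist;
  }
#endif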
  CAMLassert((trigger >= state->young_start) &&
             (trigger <= state->young_ptr));
  state->memprof_young_trigger = trigger;
  caml_reset_young_limit(state);
}

/**** Interface with systhread. ****/

CAMLexport memprof_thread_t caml_memprof_new_thread(caml_domain_state *state)
{
  return thread_create(state->memprof);
}

CAMLexport memprof_thread_t caml_memprof_main_thread(caml_domain_state *state)
{
  memprof_domain_t domain = state->memprof;
  memprof_thread_t thread = domain->threads;

  /* There should currently be just one thread in this domain */
  CAMLassert(thread);
  CAMLassert(thread->next == NULL);
  return thread;
}

CAMLexport void caml_memprof_delete_thread(memprof_thread_t thread)
{
  thread_destroy(thread);
}

CAMLexport void caml_memprof_enter_thread(memprof_thread_t thread)
{
  thread->domain->current = thread;
  update_suspended(thread->domain, thread->suspended);
}

/**** Interface to OCaml ****/

#include "caml/fail.h"

CAMLprim value caml_memprof_start(value lv, value szv, value tracker_param)
{
  caml_failwith("Gc.Memprof.start: not implemented in multicore");
}

CAMLprim value caml_memprof_stop(value unit)
{
  caml_failwith("Gc.Memprof.stop: not implemented in multicore");
}

CAMLprim value caml_memprof_discard(value profile)
{
  caml_failwith("Gc.Memprof.discard: not implemented in multicore");
}

/* FIXME: integrate memprof with multicore */
#if 0
#include <string.h>
#include "caml/memprof.h"
#include "caml/fail.h"
#include "caml/alloc.h"
#include "caml/callback.h"
#include "caml/signals.h"
#include "caml/memory.h"
#include "caml/minor_gc.h"
#include "caml/backtrace_prim.h"
#include "caml/weak.h"
#include "caml/stack.h"
#include "caml/misc.h"
#include "caml/printexc.h"
#include "caml/runtime_events.h"

#define RAND_BLOCK_SIZE 64
static uint32_t xoshiro_state[4][RAND_BLOCK_SIZE];
static uintnat rand_geom_buff[RAND_BLOCK_SIZE];
static uint32_t rand_pos;

/* [lambda] is the mean number of samples for each allocated word
   (including block headers). */
static double lambda = 0;

/* Precomputed value of [1/log(1-lambda)], for fast sampling of
   geometric distribution. Dummy if [lambda = 0]. */
static float one_log1m_lambda;

static intnat callstack_size;

/* Accessors for the OCaml type [Gc.Memprof.tracker], which is the
   type of the [tracker] global below. */
#define Alloc_minor(tracker) (Field(tracker, 0))
#define Alloc_major(tracker) (Field(tracker, 1))
#define Promote(tracker) (Field(tracker, 2))
#define Dealloc_minor(tracker) (Field(tracker, 3))
#define Dealloc_major(tracker) (Field(tracker, 4))

static value tracker;
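
/* For reference, [Gc.Memprof.tracker] is (approximately; see gc.mli
   for the authoritative declaration) the record

     { alloc_minor : allocation -> 'minor option;   (* field 0 *)
       alloc_major : allocation -> 'major option;   (* field 1 *)
       promote : 'minor -> 'major option;           (* field 2 *)
       dealloc_minor : 'minor -> unit;              (* field 3 *)
       dealloc_major : 'major -> unit; }            (* field 4 *)

   so the accessors above index its fields in declaration order. */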

/* Gc.Memprof.allocation_source */
enum { SRC_NORMAL = 0, SRC_MARSHAL = 1, SRC_CUSTOM = 2 };

struct tracked {
  /* Memory block being sampled. This is a weak GC root. */
  value block;

  /* Number of samples in this block. */
  uintnat n_samples;

  /* The size of this block. */
  uintnat wosize;

  /* The value returned by the previous callback for this block, or
     the callstack if the alloc callback has not been called yet.
     This is a strong GC root. */
  value user_data;

  /* The thread currently running a callback for this entry, or NULL
     if there is none */
  struct caml_memprof_th_ctx* running;

  /* Whether this block has been initially allocated in the minor
     heap. */
  unsigned int alloc_young : 1;

  /* The source of the allocation: normal allocations, marshal or
     custom_mem. */
  unsigned int source : 2;

  /* Whether this block has been promoted. Implies [alloc_young]. */
  unsigned int promoted : 1;

  /* Whether this block has been deallocated. */
  unsigned int deallocated : 1;

  /* There is no flag recording whether the allocation callback has
     been called: that information is encoded by the entry's location
     (a thread-local entry array before the callback,
     [entries_global] afterwards). */

  /* Whether the promotion callback has been called. */
  unsigned int cb_promote_called : 1;

  /* Whether the deallocation callback has been called. */
  unsigned int cb_dealloc_called : 1;

  /* Whether this entry is deleted. */
  unsigned int deleted : 1;
};

/* During the alloc callback for a minor allocation, the block being
   sampled is not yet allocated. Instead, we place in the [block]
   field a value computed with the following macros: */
#define Placeholder_magic 0x04200000
#define Placeholder_offs(offset) (Val_long(offset + Placeholder_magic))
#define Offs_placeholder(block) (Long_val(block) & 0xFFFF)
#define Is_placeholder(block)                                            \
  (Is_long(block) && (Long_val(block) & ~(uintnat)0xFFFF) == Placeholder_magic)
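
/* Worked example of this encoding: for the word offset 0x12, the
   stored value is [Val_long(0x04200012)]. [Is_placeholder] recovers
   the magic (0x04200012 & ~0xFFFF == 0x04200000), and
   [Offs_placeholder] the offset (0x04200012 & 0xFFFF == 0x12). The
   offset must therefore fit in 16 bits, which in practice holds for
   the offsets within a single minor allocation group. */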

/* A resizable array of entries */
struct entry_array {
  struct tracked* t;
  uintnat min_alloc_len, alloc_len, len;
  /* Before this position, the [block] and [user_data] fields point
     to the major heap ([young_idx <= len]). */
  uintnat young_idx;
  /* There are no blocks to be deleted before this position
     ([delete_idx <= len]). */
  uintnat delete_idx;
};

#define MIN_ENTRIES_LOCAL_ALLOC_LEN 16
#define MIN_ENTRIES_GLOBAL_ALLOC_LEN 128

/* Entries for other blocks. This variable is shared across threads. */
static struct entry_array entries_global =
  { NULL, MIN_ENTRIES_GLOBAL_ALLOC_LEN, 0, 0, 0, 0 };

/* There are no pending callbacks in [entries_global] before this
   position ([callback_idx <= entries_global.len]). */
static uintnat callback_idx;

#define CB_IDLE -1
#define CB_LOCAL -2
#define CB_STOPPED -3

/* Structure for thread-local variables. */
struct caml_memprof_th_ctx {
  /* [suspended] is used for masking memprof callbacks when a
     callback is running or when an uncaught exception handler is
     called. */
  int suspended;

  /* [callback_status] contains:
     - CB_STOPPED if the current thread is running a callback, but
       sampling has been stopped using [caml_memprof_stop];
     - The index of the corresponding entry in the [entries_global]
       array if the current thread is currently running a promotion
       or a deallocation callback;
     - CB_LOCAL if the current thread is currently running an
       allocation callback;
     - CB_IDLE if the current thread is not running any callback. */
  intnat callback_status;

  /* Entries for blocks whose alloc callback has not yet been
     called. */
  struct entry_array entries;
} caml_memprof_main_ctx =
  { 0, CB_IDLE, { NULL, MIN_ENTRIES_LOCAL_ALLOC_LEN, 0, 0, 0, 0 } };
static struct caml_memprof_th_ctx* local = &caml_memprof_main_ctx;

/* Pointer to the word following the next sample in the minor
   heap. Equals [Caml_state->young_alloc_start] if no sampling is
   planned in the current minor heap.
   Invariant: [caml_memprof_young_trigger <= Caml_state->young_ptr].
 */
value* caml_memprof_young_trigger;

/* Whether memprof has been initialized. */
static int init = 0;

/* Whether memprof is started. */
static int started = 0;

/* Buffer used to compute backtraces */
static value* callstack_buffer = NULL;
static intnat callstack_buffer_len = 0;

/**** Statistical sampling ****/

Caml_inline uint64_t splitmix64_next(uint64_t* x)
{
  uint64_t z = (*x += 0x9E3779B97F4A7C15ull);
  z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
  z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
  return z ^ (z >> 31);
}

static void xoshiro_init(void)
{
  int i;
  uint64_t splitmix64_state = 42;
  rand_pos = RAND_BLOCK_SIZE;
  for (i = 0; i < RAND_BLOCK_SIZE; i++) {
    uint64_t t = splitmix64_next(&splitmix64_state);
    xoshiro_state[0][i] = t & 0xFFFFFFFF;
    xoshiro_state[1][i] = t >> 32;
    t = splitmix64_next(&splitmix64_state);
    xoshiro_state[2][i] = t & 0xFFFFFFFF;
    xoshiro_state[3][i] = t >> 32;
  }
}

Caml_inline uint32_t xoshiro_next(int i)
{
  uint32_t res = xoshiro_state[0][i] + xoshiro_state[3][i];
  uint32_t t = xoshiro_state[1][i] << 9;
  xoshiro_state[2][i] ^= xoshiro_state[0][i];
  xoshiro_state[3][i] ^= xoshiro_state[1][i];
  xoshiro_state[1][i] ^= xoshiro_state[2][i];
  xoshiro_state[0][i] ^= xoshiro_state[3][i];
  xoshiro_state[2][i] ^= t;
  t = xoshiro_state[3][i];
  xoshiro_state[3][i] = (t << 11) | (t >> 21);
  return res;
}

/* Computes [log((y+0.5)/2^32)], up to a relatively good precision,
   and guarantees that the result is negative. The average absolute
   error is very close to 0. */
Caml_inline float log_approx(uint32_t y)
{
  union { float f; int32_t i; } u;
  float exp, x;
  u.f = y + 0.5f;        /* We convert y to a float ... */
  exp = u.i >> 23;       /* ... of which we extract the exponent ... */
  u.i = (u.i & 0x7FFFFF) | 0x3F800000;
  x = u.f;               /* ... and the mantissa. */
  return
    /* This polynomial computes the logarithm of the mantissa (which
       is in [1, 2]), up to an additive constant. It is chosen such
       that:
       - Its degree is 3.
       - Its average value is that of log in [1, 2] (the sampling has
         the right mean when lambda is small).
       - f(1) = f(2) - log(2) = -159*log(2) - 1e-5 (this guarantees
         that log_approx(y) is always <= -1e-5 < 0).
       - The maximum of abs(f(x)-log(x)+159*log(2)) is minimized. */
    x * (2.104659476859f + x * (-0.720478916626f + x * 0.107132064797f))

    /* Then, we add the term corresponding to the exponent, and
       additive constants. */
    + (-111.701724334061f + 0.6931471805f*exp);
}
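
/* Why the batch generator below computes
   [1 + log_approx(A[i]) * one_log1m_lambda]: if U is uniform on
   (0,1], then floor(1 + log(U) / log(1-lambda)) follows the
   geometric distribution with parameter lambda, by inverse-CDF
   sampling:
     P(floor(1 + log U / log(1-lambda)) > k)
       = P(log U <= k*log(1-lambda)) = P(U <= (1-lambda)^k)
       = (1-lambda)^k.
   [log_approx(y)] computes log(U) for U = (y+0.5)/2^32, and
   [one_log1m_lambda] caches 1/log(1-lambda). */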

/* This function regenerates [RAND_BLOCK_SIZE] geometric random
   variables at once. Doing this in batches helps performance: many
   compilers (e.g., GCC, Clang, ICC) are able to use SIMD
   instructions here. */
#ifdef SUPPORTS_TREE_VECTORIZE
__attribute__((optimize("tree-vectorize")))
#endif
static void rand_batch(void) {
  int i;

  /* Instead of using temporary buffers, we could use one big loop,
     but it turns out the SIMD optimizations of compilers are more
     fragile when using larger loops. */
  static uint32_t A[RAND_BLOCK_SIZE];
  static float B[RAND_BLOCK_SIZE];

  CAMLassert(lambda > 0.);

  /* Shuffle the xoshiro samplers, and generate uniform variables in
     A. */
  for (i = 0; i < RAND_BLOCK_SIZE; i++)
    A[i] = xoshiro_next(i);

  /* Generate exponential random variables by computing logarithms.
     We do not use math.h library functions, which are slow and
     prevent the compiler from using SIMD instructions. */
  for (i = 0; i < RAND_BLOCK_SIZE; i++)
    B[i] = 1 + log_approx(A[i]) * one_log1m_lambda;

  /* We do the final flooring for generating geometric variables.
     Compilers are unlikely to use SIMD instructions for this loop,
     because it involves a conditional and variables of different
     sizes (32 and 64 bits). */
  for (i = 0; i < RAND_BLOCK_SIZE; i++) {
    double f = B[i];
    CAMLassert (f >= 1);
    /* [Max_long+1] is a power of two => no rounding in the test. */
    if (f >= Max_long+1)
      rand_geom_buff[i] = Max_long;
    else rand_geom_buff[i] = (uintnat)f;
  }

  rand_pos = 0;
}

/* Simulate a geometric variable of parameter [lambda].
   The result is clipped in [1..Max_long] */
static uintnat rand_geom(void)
{
  uintnat res;
  CAMLassert(lambda > 0.);
  if (rand_pos == RAND_BLOCK_SIZE) rand_batch();
  res = rand_geom_buff[rand_pos++];
  CAMLassert(1 <= res && res <= Max_long);
  return res;
}

static uintnat next_rand_geom;
/* Simulate a binomial variable of parameters [len] and [lambda].
   This sampling algorithm has running time linear in [len * lambda].
   We could use a more involved algorithm, but this should be good
   enough since, in the average use case, [lambda] <= 0.01 and
   therefore the generation of the binomial variable is amortized by
   the initialization of the corresponding block.
   If needed, we could use algorithm BTRS from the paper:
     Hormann, Wolfgang. "The generation of binomial random variates."
     Journal of Statistical Computation and Simulation 46.1-2 (1993),
     pp. 101-110. */
static uintnat rand_binom(uintnat len)
{
  uintnat res;
  CAMLassert(lambda > 0. && len < Max_long);
  for (res = 0; next_rand_geom < len; res++)
    next_rand_geom += rand_geom();
  next_rand_geom -= len;
  return res;
}
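
/* Illustration: conceptually, every word of every allocation is
   sampled independently with probability [lambda], so a fresh block
   of [whsize] words receives Binomial(whsize, lambda) samples.
   [rand_binom] computes this by counting how many geometric
   inter-arrival gaps fall inside the block, carrying the leftover
   gap ([next_rand_geom]) over to the next allocation. A hypothetical
   caller, in the shape used by the tracking functions below: */
#if 0
  uintnat n_samples = rand_binom(Whsize_val(block));
  if (n_samples > 0) {
    /* [block] is sampled; create a tracking entry for it. */
  }
#endif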

/**** Capturing the call stack *****/

/* This function is called, e.g., from [caml_alloc_shr], which
   guarantees that the GC is not called. Clients may use it in a
   context where the heap is in an invalid state, or when the roots
   are not properly registered. Therefore, we do not use
   [caml_alloc], which may call the GC, but
   [caml_alloc_shr_no_track_noexc], which gives this guarantee. The
   return value is either a valid callstack or 0 in out-of-memory
   scenarios. */
static value capture_callstack_postponed(void)
{
  value res;
  intnat callstack_len =
    caml_collect_current_callstack(&callstack_buffer,
                                   &callstack_buffer_len,
                                   callstack_size, -1);
  if (callstack_len == 0) return Atom(0);
  res = caml_alloc_shr_no_track_noexc(callstack_len, 0);
  if (res == 0) return Atom(0);
  memcpy(Op_val(res), callstack_buffer, sizeof(value) * callstack_len);
  if (callstack_buffer_len > 256 &&
      callstack_buffer_len > callstack_len * 8) {
    caml_stat_free(callstack_buffer);
    callstack_buffer = NULL;
    callstack_buffer_len = 0;
  }
  return res;
}

/* In this version, we are allowed to call the GC, so we use
   [caml_alloc], which is more efficient since it uses the minor
   heap. Should be called with [local->suspended == 1] */
static value capture_callstack(int alloc_idx)
{
  value res;
  intnat callstack_len =
    caml_collect_current_callstack(&callstack_buffer,
                                   &callstack_buffer_len,
                                   callstack_size, alloc_idx);
  CAMLassert(local->suspended);
  res = caml_alloc(callstack_len, 0);
  memcpy(Op_val(res), callstack_buffer, sizeof(value) * callstack_len);
  if (callstack_buffer_len > 256 &&
      callstack_buffer_len > callstack_len * 8) {
    caml_stat_free(callstack_buffer);
    callstack_buffer = NULL;
    callstack_buffer_len = 0;
  }
  return res;
}

/**** Managing data structures for tracked blocks. ****/

/* Reallocate the [ea] array if it is either too small or too large.
   [grow] is the number of free cells needed. Returns 1 if
   reallocation succeeded (so that [ea->alloc_len] is at least
   [ea->len+grow]), and 0 otherwise. */
static int realloc_entries(struct entry_array* ea, uintnat grow)
{
  uintnat new_alloc_len, new_len = ea->len + grow;
  struct tracked* new_t;
  if (new_len <= ea->alloc_len &&
      (4*new_len >= ea->alloc_len || ea->alloc_len == ea->min_alloc_len))
    return 1;
  new_alloc_len = new_len * 2;
  if (new_alloc_len < ea->min_alloc_len)
    new_alloc_len = ea->min_alloc_len;
  new_t = caml_stat_resize_noexc(ea->t,
                                 new_alloc_len * sizeof(struct tracked));
  if (new_t == NULL) return 0;
  ea->t = new_t;
  ea->alloc_len = new_alloc_len;
  return 1;
}

#define Invalid_index (~(uintnat)0)

Caml_inline uintnat new_tracked(uintnat n_samples, uintnat wosize,
                                int source, int is_young,
                                value block, value user_data)
{
  struct tracked *t;
  if (!realloc_entries(&local->entries, 1))
    return Invalid_index;
  local->entries.len++;
  t = &local->entries.t[local->entries.len - 1];
  t->block = block;
  t->n_samples = n_samples;
  t->wosize = wosize;
  t->user_data = user_data;
  t->running = NULL;
  t->alloc_young = is_young;
  t->source = source;
  t->promoted = 0;
  t->deallocated = 0;
  t->cb_promote_called = t->cb_dealloc_called = 0;
  t->deleted = 0;
  return local->entries.len - 1;
}

static void mark_deleted(struct entry_array* ea, uintnat t_idx)
{
  struct tracked* t = &ea->t[t_idx];
  t->deleted = 1;
  t->user_data = Val_unit;
  t->block = Val_unit;
  if (t_idx < ea->delete_idx) ea->delete_idx = t_idx;
}

Caml_inline value run_callback_exn(
  struct entry_array* ea, uintnat t_idx, value cb, value param)
{
  struct tracked* t = &ea->t[t_idx];
  value res;
  CAMLassert(t->running == NULL);
  CAMLassert(lambda > 0.);

  local->callback_status = ea == &entries_global ? t_idx : CB_LOCAL;
  t->running = local;
  t->user_data = Val_unit; /* Release root. */
  res = caml_callback_exn(cb, param);
  if (local->callback_status == CB_STOPPED) {
    /* Make sure this entry has not been removed by
       [caml_memprof_stop] */
    local->callback_status = CB_IDLE;
    return Is_exception_result(res) ? res : Val_unit;
  }
  /* The call above can move the tracked entry and thus invalidate
     [t_idx] and [t]. */
  if (ea == &entries_global) {
    CAMLassert(local->callback_status >= 0 &&
               local->callback_status < ea->len);
    t_idx = local->callback_status;
    t = &ea->t[t_idx];
  }
  local->callback_status = CB_IDLE;
  CAMLassert(t->running == local);
  t->running = NULL;

  if (Is_exception_result(res) || res == Val_unit) {
    /* Callback raised an exception or returned None or (), discard
       this entry. */
    mark_deleted(ea, t_idx);
    return res;
  } else {
    /* Callback returned [Some _]. Store the value in [user_data]. */
    CAMLassert(!Is_exception_result(res) && Is_block(res) &&
               Tag_val(res) == 0 && Wosize_val(res) == 1);
    t->user_data = Field(res, 0);
    if (Is_block(t->user_data) && Is_young(t->user_data) &&
        t_idx < ea->young_idx)
      ea->young_idx = t_idx;
    /* If the following conditions are met:
       - we are running a promotion callback,
       - the corresponding block is deallocated,
       - another thread is running callbacks in
         [caml_memprof_handle_postponed_exn],
       then [callback_idx] may have moved forward during this
       callback, which means that we may forget to run the
       deallocation callback. Hence, we reset [callback_idx] if
       appropriate. */
    if (ea == &entries_global && t->deallocated &&
        !t->cb_dealloc_called && callback_idx > t_idx)
      callback_idx = t_idx;
    return Val_unit;
  }
}
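
/* Note on the test [res == Val_unit] above: in the OCaml value
   encoding, [None], [()], and [Val_unit] are all the immediate
   Val_int(0), so the single comparison covers both the
   option-returning callbacks (alloc, promote) and the unit-returning
   ones (dealloc). */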

/* Run the allocation callback for a given entry of the local entries
   array. This assumes that the corresponding [deleted] and [running]
   fields of the entry are both set to 0. Reentrancy is not a problem
   for this function, since other threads will use a different array
   for entries. The index of the entry will not change, except if
   [caml_memprof_stop] is called.
   Returns:
   - An exception result if the callback raised an exception
   - Val_long(0) == Val_unit == None otherwise */
static value run_alloc_callback_exn(uintnat t_idx)
{
  struct tracked* t = &local->entries.t[t_idx];
  value sample_info;
  CAMLassert(Is_block(t->block) || Is_placeholder(t->block) ||
             t->deallocated);

  sample_info = caml_alloc_small(4, 0);
  Field(sample_info, 0) = Val_long(t->n_samples);
  Field(sample_info, 1) = Val_long(t->wosize);
  Field(sample_info, 2) = Val_long(t->source);
  Field(sample_info, 3) = t->user_data;
  return run_callback_exn(&local->entries, t_idx,
      t->alloc_young ? Alloc_minor(tracker) : Alloc_major(tracker),
      sample_info);
}

/* Remove any deleted entries from [ea], updating [ea->young_idx] and
   [callback_idx] if [ea == &entries_global]. */
static void flush_deleted(struct entry_array* ea)
{
  uintnat i, j;

  if (ea == NULL) return;

  j = i = ea->delete_idx;
  while (i < ea->len) {
    if (!ea->t[i].deleted) {
      struct caml_memprof_th_ctx* runner = ea->t[i].running;
      if (runner != NULL && runner->callback_status == i)
        runner->callback_status = j;
      ea->t[j] = ea->t[i];
      j++;
    }
    i++;
    if (ea->young_idx == i) ea->young_idx = j;
    if (ea == &entries_global && callback_idx == i) callback_idx = j;
  }
  ea->delete_idx = ea->len = j;
  CAMLassert(ea != &entries_global || callback_idx <= ea->len);
  CAMLassert(ea->young_idx <= ea->len);
  realloc_entries(ea, 0);
}

static void check_action_pending(void)
{
  if (local->suspended) return;
  if (callback_idx < entries_global.len || local->entries.len > 0)
    caml_set_action_pending(Caml_state);
}
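
/* Worked example of [flush_deleted]: with five entries
   [live; deleted; live; deleted; live] and [delete_idx] = 1, the
   array compacts to three entries, and every surviving index is
   remapped to the number of live entries preceding it (index 2
   becomes 1, index 4 becomes 2). The same remapping is applied on
   the fly to [young_idx], to [callback_idx], and to the
   [callback_status] of any thread currently running a callback on a
   moved entry. */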

/* In case of a thread context switch during a callback, this can be
   called in a reentrant way. */
value caml_memprof_handle_postponed_exn(void)
{
  value res = Val_unit;
  uintnat i;
  if (local->suspended) return Val_unit;
  if (callback_idx >= entries_global.len && local->entries.len == 0)
    return Val_unit;
  caml_memprof_set_suspended(1);

  for (i = 0; i < local->entries.len; i++) {
    /* We are the only thread allowed to modify [local->entries], so
       the indices cannot shift, but it is still possible that
       [caml_memprof_stop] got called during the callback,
       invalidating all the entries. */
    res = run_alloc_callback_exn(i);
    if (Is_exception_result(res)) goto end;
    if (local->entries.len == 0)
      goto end; /* [caml_memprof_stop] has been called. */
    if (local->entries.t[i].deleted) continue;
    if (realloc_entries(&entries_global, 1))
      /* Transfer the entry to the global array. */
      entries_global.t[entries_global.len++] = local->entries.t[i];
    mark_deleted(&local->entries, i);
  }

  while (callback_idx < entries_global.len) {
    struct tracked* t = &entries_global.t[callback_idx];

    if (t->deleted || t->running != NULL) {
      /* This entry is not ready. Ignore it. */
      callback_idx++;
    } else if (t->promoted && !t->cb_promote_called) {
      t->cb_promote_called = 1;
      res = run_callback_exn(&entries_global, callback_idx,
                             Promote(tracker), t->user_data);
      if (Is_exception_result(res)) goto end;
    } else if (t->deallocated && !t->cb_dealloc_called) {
      value cb = (t->promoted || !t->alloc_young) ?
        Dealloc_major(tracker) : Dealloc_minor(tracker);
      t->cb_dealloc_called = 1;
      res = run_callback_exn(&entries_global, callback_idx,
                             cb, t->user_data);
      if (Is_exception_result(res)) goto end;
    } else {
      /* There is nothing more to do with this entry. */
      callback_idx++;
    }
  }

 end:
  flush_deleted(&local->entries);
  flush_deleted(&entries_global);
  /* We need to reset the suspended flag *after* flushing
     [local->entries] to make sure the flag is not set back to 1. */
  caml_memprof_set_suspended(0);
  return res;
}

/**** Handling weak and strong roots when the GC runs. ****/

typedef void (*ea_action)(struct entry_array*, void*);
struct call_on_entry_array_data { ea_action f; void *data; };

static void call_on_entry_array(struct caml_memprof_th_ctx* ctx,
                                void *data)
{
  struct call_on_entry_array_data* closure = data;
  closure->f(&ctx->entries, closure->data);
}

static void entry_arrays_iter(ea_action f, void *data)
{
  struct call_on_entry_array_data closure = { f, data };
  f(&entries_global, data);
  caml_memprof_th_ctx_iter_hook(call_on_entry_array, &closure);
}

static void entry_array_oldify_young_roots(struct entry_array *ea,
                                           void *data)
{
  uintnat i;
  (void)data;
  /* This loop should always have a small number of iterations (when
     compared to the size of the minor heap), because the [young_idx]
     pointer should always be close to the end of the array. Indeed,
     it is only moved back when returning from a callback triggered
     by allocation or promotion, which can only happen for blocks
     allocated recently, which are close to the end of the
     [entries_global] array. */
  for (i = ea->young_idx; i < ea->len; i++)
    caml_oldify_one(ea->t[i].user_data, &ea->t[i].user_data);
}

void caml_memprof_oldify_young_roots(void)
{
  entry_arrays_iter(entry_array_oldify_young_roots, NULL);
}

static void entry_array_minor_update(struct entry_array *ea, void *data)
{
  uintnat i;
  (void)data;
  /* See comment in [entry_array_oldify_young_roots] for the number
     of iterations of this loop. */
  for (i = ea->young_idx; i < ea->len; i++) {
    struct tracked *t = &ea->t[i];
    CAMLassert(Is_block(t->block) || t->deleted || t->deallocated ||
               Is_placeholder(t->block));
    if (Is_block(t->block) && Is_young(t->block)) {
      if (Hd_val(t->block) == 0) {
        /* Block has been promoted */
        t->block = Field(t->block, 0);
        t->promoted = 1;
      } else {
        /* Block is dead */
        CAMLassert_young_header(Hd_val(t->block));
        t->block = Val_unit;
        t->deallocated = 1;
      }
    }
  }
  ea->young_idx = ea->len;
}

void caml_memprof_minor_update(void)
{
  if (callback_idx > entries_global.young_idx) {
    /* The entries after [entries_global.young_idx] will possibly get
       promoted. Hence, there might be pending promotion
       callbacks. */
    callback_idx = entries_global.young_idx;
    check_action_pending();
  }
  entry_arrays_iter(entry_array_minor_update, NULL);
}

static void entry_array_do_roots(struct entry_array *ea, void* data)
{
  scanning_action f = data;
  uintnat i;
  for (i = 0; i < ea->len; i++)
    f(ea->t[i].user_data, &ea->t[i].user_data);
}

void caml_memprof_do_roots(scanning_action f)
{
  entry_arrays_iter(entry_array_do_roots, f);
}

static void entry_array_clean_phase(struct entry_array *ea, void* data)
{
  uintnat i;
  (void)data;
  for (i = 0; i < ea->len; i++) {
    struct tracked *t = &ea->t[i];
    if (Is_block(t->block) && !Is_young(t->block)) {
      CAMLassert(Is_in_heap(t->block));
      CAMLassert(!t->alloc_young || t->promoted);
      if (Is_white_val(t->block)) {
        t->block = Val_unit;
        t->deallocated = 1;
      }
    }
  }
}

void caml_memprof_update_clean_phase(void)
{
  entry_arrays_iter(entry_array_clean_phase, NULL);
  callback_idx = 0;
  check_action_pending();
}

static void entry_array_invert(struct entry_array *ea, void *data)
{
  uintnat i;
  (void)data;
  for (i = 0; i < ea->len; i++)
    caml_invert_root(ea->t[i].block, &ea->t[i].block);
}

void caml_memprof_invert_tracked(void)
{
  entry_arrays_iter(entry_array_invert, NULL);
}

/**** Sampling procedures ****/

static void maybe_track_block(value block, uintnat n_samples,
                              uintnat wosize, int src)
{
  value callstack;
  if (n_samples == 0) return;

  callstack = capture_callstack_postponed();
  if (callstack == 0) return;

  new_tracked(n_samples, wosize, src, Is_young(block), block, callstack);
  check_action_pending();
}

void caml_memprof_track_alloc_shr(value block)
{
  CAMLassert(Is_in_heap(block));
  if (lambda == 0 || local->suspended) return;

  maybe_track_block(block, rand_binom(Whsize_val(block)),
                    Wosize_val(block), SRC_NORMAL);
}

void caml_memprof_track_custom(value block, mlsize_t bytes)
{
  CAMLassert(Is_young(block) || Is_in_heap(block));
  if (lambda == 0 || local->suspended) return;

  maybe_track_block(block, rand_binom(Wsize_bsize(bytes)),
                    Wsize_bsize(bytes), SRC_CUSTOM);
}

/* Shifts the next sample in the minor heap by [n] words. Essentially,
   this tells the sampler to ignore the next [n] words of the minor
   heap. */
static void shift_sample(uintnat n)
{
  if (caml_memprof_young_trigger - Caml_state->young_alloc_start > n)
    caml_memprof_young_trigger -= n;
  else
    caml_memprof_young_trigger = Caml_state->young_alloc_start;
  caml_reset_young_limit(Caml_state);
}
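
/* Overview of the sampling entry points in this section, one per
   allocation path:
   - [caml_memprof_track_alloc_shr] (above): direct major-heap
     allocations; binomial sampling on the fresh block.
   - [caml_memprof_track_custom] (above): custom blocks, sampled on
     their declared out-of-heap size.
   - [caml_memprof_track_young] (below): minor-heap allocations,
     driven by the [caml_memprof_young_trigger] test in the
     allocation fast path.
   - [caml_memprof_track_interned] (below): blocks created by
     demarshaling, scanned using geometric gaps. */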

/* Called when exceeding the threshold for the next sample in the
   minor heap, from the C code (the handling is different when called
   from natively compiled OCaml code). */
void caml_memprof_track_young(uintnat wosize, int from_caml,
                              int nallocs,
                              unsigned char* encoded_alloc_lens)
{
  uintnat whsize = Whsize_wosize(wosize);
  value callstack, res = Val_unit;
  int alloc_idx = 0, i, allocs_sampled = 0;
  intnat alloc_ofs, trigger_ofs;
  double saved_lambda = lambda;

  /* If this condition were false, then [caml_memprof_young_trigger]
     would be equal to [Caml_state->young_alloc_start]. But this
     function is only called with [Caml_state->young_alloc_start <=
     Caml_state->young_ptr < caml_memprof_young_trigger], which would
     be contradictory. */
  CAMLassert(!local->suspended && lambda > 0);

  if (!from_caml) {
    unsigned n_samples = 1 +
      rand_binom(caml_memprof_young_trigger - 1 - Caml_state->young_ptr);
    CAMLassert(encoded_alloc_lens == NULL); /* No Comballoc in C! */
    caml_memprof_renew_minor_sample();
    maybe_track_block(Val_hp(Caml_state->young_ptr), n_samples,
                      wosize, SRC_NORMAL);
    return;
  }

  /* We need to call the callbacks for this sampled block. Since each
     callback can potentially allocate, the sampled block will *not*
     be the one pointed to by [caml_memprof_young_trigger]. Instead,
     we remember that we need to sample the next allocated word, call
     the callback and use as a sample the block which will be
     allocated right after the callback. */
  CAMLassert(Caml_state->young_ptr < caml_memprof_young_trigger &&
             caml_memprof_young_trigger <=
               Caml_state->young_ptr + whsize);
  trigger_ofs = caml_memprof_young_trigger - Caml_state->young_ptr;
  alloc_ofs = whsize;

  /* Restore the minor heap in a valid state for calling the
     callbacks. We should not call the GC before these two
     instructions. */
  Caml_state->young_ptr += whsize;
  caml_memprof_set_suspended(1); /* This also updates the memprof
                                    trigger. */

  /* Perform the sampling of the blocks in the set of Comballoc'd
     blocks, insert them in the entries array, and run the
     callbacks. */
  for (alloc_idx = nallocs - 1; alloc_idx >= 0; alloc_idx--) {
    unsigned alloc_wosz = encoded_alloc_lens == NULL ? wosize :
      Wosize_encoded_alloc_len(encoded_alloc_lens[alloc_idx]);
    unsigned n_samples = 0;
    alloc_ofs -= Whsize_wosize(alloc_wosz);
    while (alloc_ofs < trigger_ofs) {
      n_samples++;
      trigger_ofs -= rand_geom();
    }
    if (n_samples > 0) {
      uintnat t_idx;
      int stopped;
      callstack = capture_callstack(alloc_idx);
      t_idx = new_tracked(n_samples, alloc_wosz, SRC_NORMAL, 1,
                          Placeholder_offs(alloc_ofs), callstack);
      if (t_idx == Invalid_index) continue;
      res = run_alloc_callback_exn(t_idx);
      /* Has [caml_memprof_stop] been called during the callback? */
      stopped = local->entries.len == 0;
      if (stopped) {
        allocs_sampled = 0;
        if (saved_lambda != lambda) {
          /* [lambda] changed during the callback. We need to refresh
             [trigger_ofs]. */
          saved_lambda = lambda;
          trigger_ofs = lambda == 0. ? 0 :
            alloc_ofs - (rand_geom() - 1);
        }
      }
      if (Is_exception_result(res)) break;
      if (!stopped) allocs_sampled++;
    }
  }

  CAMLassert(alloc_ofs == 0 || Is_exception_result(res));
  CAMLassert(allocs_sampled <= nallocs);

  if (!Is_exception_result(res)) {
    /* The callbacks did not raise. The allocation will take place.
       We now restore the minor heap in the state needed by
       [Alloc_small_aux]. */
    if (Caml_state->young_ptr - whsize < Caml_state->young_trigger) {
      CAML_EV_COUNTER(EV_C_FORCE_MINOR_MEMPROF, 1);
      caml_gc_dispatch();
    }

    /* Re-allocate the blocks in the minor heap. We should not call
       the GC after this. */
    Caml_state->young_ptr -= whsize;

    /* Make sure this block is not going to be sampled again. */
    shift_sample(whsize);
  }

  /* Since [local->entries] is local to the current thread, we know
     for sure that the allocated entries are the [allocs_sampled]
     last entries of [local->entries]. */
  for (i = 0; i < allocs_sampled; i++) {
    uintnat idx = local->entries.len - allocs_sampled + i;
    if (local->entries.t[idx].deleted) continue;
    if (realloc_entries(&entries_global, 1)) {
      /* Transfer the entry to the global array. */
      struct tracked* t = &entries_global.t[entries_global.len];
      entries_global.len++;
      *t = local->entries.t[idx];

      if (Is_exception_result(res)) {
        /* The allocations are cancelled because of the exception,
           but this callback has already been called. We simulate a
           deallocation. */
        t->block = Val_unit;
        t->deallocated = 1;
      } else {
        /* The execution of the callback has succeeded, so we start
           the tracking of this block. Subtlety: we are actually
           writing [t->block] with an invalid (uninitialized) block.
           This is correct because the allocation and initialization
           happens right after returning from
           [caml_memprof_track_young]. */
        t->block = Val_hp(Caml_state->young_ptr +
                          Offs_placeholder(t->block));

        /* Avoid setting the action-pending flag needlessly: we just
           appended an entry to the global array, but this new entry
           does not yet need a promotion or deallocation callback. */
        if (callback_idx == entries_global.len - 1)
          callback_idx = entries_global.len;
      }
    }
    mark_deleted(&local->entries, idx);
  }

  flush_deleted(&local->entries);
  /* We need to reset the suspended flag *after* flushing
     [local->entries] to make sure the flag is not set back to 1. */
  caml_memprof_set_suspended(0);

  if (Is_exception_result(res))
    caml_raise(Extract_exception(res));

  /* /!\ Since the heap is in an invalid state before initialization,
     very few heap operations are allowed until then. */
  return;
}

void caml_memprof_track_interned(header_t* block, header_t* blockend)
{
  header_t *p;
  value callstack = 0;
  int is_young = Is_young(Val_hp(block));

  if (lambda == 0 || local->suspended) return;

  p = block;
  while (1) {
    uintnat next_sample = rand_geom();
    header_t *next_sample_p, *next_p;
    if (next_sample > blockend - p) break;
    /* [next_sample_p] is the block *following* the next sampled
       block! */
    next_sample_p = p + next_sample;

    while (1) {
      next_p = p + Whsize_hp(p);
      if (next_p >= next_sample_p) break;
      p = next_p;
    }

    if (callstack == 0) callstack = capture_callstack_postponed();
    if (callstack == 0) break; /* OOM */
    new_tracked(rand_binom(next_p - next_sample_p) + 1,
                Wosize_hp(p), SRC_MARSHAL, is_young, Val_hp(p),
                callstack);
    p = next_p;
  }
  check_action_pending();
}

/**** Interface with the OCaml code. ****/

static void caml_memprof_init(void)
{
  init = 1;
  xoshiro_init();
}

CAMLprim value caml_memprof_start(value lv, value szv,
                                  value tracker_param)
{
  CAMLparam3(lv, szv, tracker_param);
  double l = Double_val(lv);
  intnat sz = Long_val(szv);

  if (started) caml_failwith("Gc.Memprof.start: already started.");

  if (sz < 0 || !(l >= 0.) || l > 1.) /* Checks that [l] is not NAN. */
    caml_invalid_argument("Gc.Memprof.start");

  if (!init) caml_memprof_init();

  lambda = l;
  if (l > 0) {
    one_log1m_lambda = l == 1 ? 0 : 1/caml_log1p(-l);
    rand_pos = RAND_BLOCK_SIZE;
    /* [next_rand_geom] can be zero if the next word is to be
       sampled, but [rand_geom] always returns a value >= 1.
       Subtract 1 to correct. */
    next_rand_geom = rand_geom() - 1;
  }

  caml_memprof_renew_minor_sample();

  callstack_size = sz;

  started = 1;

  tracker = tracker_param;
  caml_register_generational_global_root(&tracker);

  CAMLreturn(Val_unit);
}

static void empty_entry_array(struct entry_array *ea)
{
  if (ea != NULL) {
    ea->alloc_len = ea->len = ea->young_idx = ea->delete_idx = 0;
    caml_stat_free(ea->t);
    ea->t = NULL;
  }
}

CAMLprim value caml_memprof_stop(value unit)
{
  if (!started) caml_failwith("Gc.Memprof.stop: not started.");

  /* Discard the tracked blocks in the global entries array. */
  empty_entry_array(&entries_global);

  /* Discard the tracked blocks in the local entries arrays, and set
     [callback_status] to [CB_STOPPED]. */
  caml_memprof_th_ctx_iter_hook(th_ctx_memprof_stop, NULL);

  callback_idx = 0;
  lambda = 0;
  /* Reset the memprof trigger in order to make sure we won't enter
     [caml_memprof_track_young]. */
  caml_memprof_renew_minor_sample();
  started = 0;

  caml_remove_generational_global_root(&tracker);

  caml_stat_free(callstack_buffer);
  callstack_buffer = NULL;
  callstack_buffer_len = 0;

  return Val_unit;
}
#endif