diff --git a/lib/cdo/app_server_hooks.rb b/lib/cdo/app_server_hooks.rb index 8a938dc8f8a41..cef30c2d333c3 100644 --- a/lib/cdo/app_server_hooks.rb +++ b/lib/cdo/app_server_hooks.rb @@ -1,3 +1,5 @@ +require 'cdo/process_memory' + module Cdo # NOTE: these hooks are only executed when running in puma clustered mode, which spawns worker processes. # These hooks will NOT be run in local development unless you set `dashboard_workers: 1` (or greater) @@ -25,6 +27,42 @@ def self.before_fork restart_period = DCDO.get('web_service_process_restart_period', 12 * 3600) # default to 12 hours PumaWorkerKiller.enable_rolling_restart(restart_period) end + + # Compact heap before forking child puma processes to reduce the number of heap pages occupied by long-lived + # objects, which reduces the surface area for Copy-on-Write erosion. + unless @compacted_heap_before_worker_fork + @compacted_heap_before_worker_fork = true + + begin + before_gc = GC.stat + started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC) + Cdo::ProcessMemory.log_snapshot( + 'Compacting Ruby heap before Puma worker fork', + fields: { + heap_allocated_pages: before_gc[:heap_allocated_pages], + heap_live_slots: before_gc[:heap_live_slots], + old_objects: before_gc[:old_objects] + } + ) + + GC.start(full_mark: true, immediate_sweep: true) + GC.compact + + after_gc = GC.stat + duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at + Cdo::ProcessMemory.log_snapshot( + 'Compacted Ruby heap before Puma worker fork', + fields: { + duration_seconds: duration.round(3), + heap_allocated_pages: after_gc[:heap_allocated_pages], + heap_live_slots: after_gc[:heap_live_slots], + old_objects: after_gc[:old_objects] + } + ) + rescue StandardError => exception + CDO.log.warn("Failed to compact Ruby heap before Puma worker fork: #{exception.class}: #{exception.message}") + end + end end def self.before_worker_boot(host:) diff --git a/lib/cdo/process_memory.rb b/lib/cdo/process_memory.rb new file mode 100644 
module Cdo
  # Lightweight process-memory snapshots for diagnostics.
  #
  # On Linux, this reads /proc/$pid/status and /proc/$pid/smaps_rollup and returns
  # selected values (kilobytes, except for the thread count). On macOS and other
  # systems without those procfs files, it returns an empty hash. Missing or
  # unreadable procfs files are treated as unavailable metrics, not as errors.
  #
  # Logging goes through CDO.log, so the `cdo` library must be loaded (this file's
  # top-level `require 'cdo'`) before log_snapshot is called.
  module ProcessMemory
    # Field names in /proc/$pid/status mapped to the metric keys used in snapshots.
    # All values are reported by the kernel in kB except 'Threads', which is a count.
    STATUS_FIELDS = {
      'VmRSS' => :proc_vm_rss_kb,
      'VmHWM' => :proc_vm_hwm_kb,
      'VmSize' => :proc_vm_size_kb,
      'VmData' => :proc_vm_data_kb,
      'VmSwap' => :proc_vm_swap_kb,
      'Threads' => :proc_threads
    }.freeze

    # Field names in /proc/$pid/smaps_rollup mapped to the metric keys used in snapshots.
    SMAPS_ROLLUP_FIELDS = {
      'Rss' => :smaps_rss_kb,
      'Pss' => :smaps_pss_kb,
      'Private_Clean' => :smaps_private_clean_kb,
      'Private_Dirty' => :smaps_private_dirty_kb,
      'Shared_Clean' => :smaps_shared_clean_kb,
      'Shared_Dirty' => :smaps_shared_dirty_kb
    }.freeze

    # Collects the selected procfs metrics for a process.
    #
    # @param pid [Integer, String] process whose procfs entries are read (defaults to this process)
    # @return [Hash{Symbol => Integer}] metric key => value; empty when no procfs data is available
    def self.snapshot_kb(pid: Process.pid)
      snapshot = {}
      read_kb_fields("/proc/#{pid}/status", STATUS_FIELDS, snapshot)
      read_kb_fields("/proc/#{pid}/smaps_rollup", SMAPS_ROLLUP_FIELDS, snapshot)
      snapshot
    end

    # Logs `message` at info level followed by `key=value` pairs for the caller-supplied
    # fields and the current memory snapshot.
    #
    # @param message [String] prefix of the log line
    # @param fields [Hash] extra pairs to log; on a key collision the snapshot value wins
    # @param pid [Integer, String] process to snapshot (defaults to this process)
    # @return [Hash] the merged fields and snapshot metrics that were logged
    def self.log_snapshot(message, fields: {}, pid: Process.pid)
      metrics = fields.merge(snapshot_kb(pid: pid))
      CDO.log.info("#{message}: #{metrics.map {|key, value| "#{key}=#{value}"}.join(', ')}")
      metrics
    end

    # Parses `Name: value` lines from `path` and stores the values whose name appears in
    # `field_names` into `snapshot` under the mapped metric key.
    #
    # @param path [String] procfs file to read
    # @param field_names [Hash{String => Symbol}] procfs field name => metric key
    # @param snapshot [Hash] accumulator, mutated in place
    # @return [Hash] `snapshot`, unchanged (or partially filled) if the file cannot be read
    def self.read_kb_fields(path, field_names, snapshot)
      # Cheap pre-check so platforms without procfs do not pay for an exception.
      return snapshot unless File.readable?(path)

      File.foreach(path) do |line|
        key, value = line.split(':', 2)
        metric_name = field_names[key]
        # String#to_i skips the leading whitespace and stops at the " kB" suffix.
        snapshot[metric_name] = value.to_i if metric_name
      end

      snapshot
    rescue SystemCallError
      # The readable? check cannot rule out failures during the read itself (for example
      # ESRCH when the target process exits mid-read, EPERM, or EISDIR). Every Errno::*
      # is a SystemCallError; all of them mean "metric unavailable" for diagnostics.
      snapshot
    end
    private_class_method :read_kb_fields
  end
end