diff --git a/.server-changes/reduce-otel-ingestion-log-volume.md b/.server-changes/reduce-otel-ingestion-log-volume.md new file mode 100644 index 0000000000..fb9be2d5ad --- /dev/null +++ b/.server-changes/reduce-otel-ingestion-log-volume.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Move per-batch ClickHouse event-insert logs, the sampled event-loop utilization log, and the flush-concurrency adjustment log to the debug level to cut default log volume, and add an `HTTP_ACCESS_LOG_DISABLED` env var that, when set to `1`, suppresses successful (2xx) HTTP access logs while still logging every non-2xx response so errors stay visible. diff --git a/apps/webapp/app/eventLoopMonitor.server.ts b/apps/webapp/app/eventLoopMonitor.server.ts index 0a1b33690b..2c536e7bb3 100644 --- a/apps/webapp/app/eventLoopMonitor.server.ts +++ b/apps/webapp/app/eventLoopMonitor.server.ts @@ -124,7 +124,7 @@ function startEventLoopUtilizationMonitoring() { const utilization = Number.isFinite(diff.utilization) ? diff.utilization : 0; if (Math.random() < env.EVENT_LOOP_MONITOR_UTILIZATION_SAMPLE_RATE) { - logger.info("nodejs.event_loop.utilization", { utilization }); + logger.debug("nodejs.event_loop.utilization", { utilization }); } lastEventLoopUtilization = currentEventLoopUtilization; diff --git a/apps/webapp/app/v3/dynamicFlushScheduler.server.ts b/apps/webapp/app/v3/dynamicFlushScheduler.server.ts index 9f2cffec9e..afa3e13322 100644 --- a/apps/webapp/app/v3/dynamicFlushScheduler.server.ts +++ b/apps/webapp/app/v3/dynamicFlushScheduler.server.ts @@ -310,7 +310,7 @@ export class DynamicFlushScheduler { if (newConcurrency !== currentConcurrency) { this.limiter = pLimit(newConcurrency); - this.logger.info("Adjusted flush concurrency", { + this.logger.debug("Adjusted flush concurrency", { previousConcurrency: currentConcurrency, newConcurrency, queuePressure, diff --git a/apps/webapp/app/v3/eventRepository/clickhouseEventRepository.server.ts b/apps/webapp/app/v3/eventRepository/clickhouseEventRepository.server.ts index 4375718683..13e3bce053 100644 --- 
a/apps/webapp/app/v3/eventRepository/clickhouseEventRepository.server.ts +++ b/apps/webapp/app/v3/eventRepository/clickhouseEventRepository.server.ts @@ -269,7 +269,7 @@ export class ClickhouseEventRepository implements IEventRepository { return; } - logger.info("ClickhouseEventRepository.flushBatch Inserted batch into clickhouse", { + logger.debug("ClickhouseEventRepository.flushBatch Inserted batch into clickhouse", { events: events.length, insertResult: outcome.insertResult, sanitized: outcome.kind === "sanitized", @@ -302,7 +302,7 @@ export class ClickhouseEventRepository implements IEventRepository { return; } - logger.info("ClickhouseEventRepository.flushLlmMetricsBatch Inserted LLM metrics batch", { + logger.debug("ClickhouseEventRepository.flushLlmMetricsBatch Inserted LLM metrics batch", { rows: rows.length, sanitized: outcome.kind === "sanitized", }); @@ -421,7 +421,7 @@ export class ClickhouseEventRepository implements IEventRepository { throw insertError; } - logger.info("ClickhouseEventRepository.flushOtelMetricsBatch Inserted OTLP metrics batch", { + logger.debug("ClickhouseEventRepository.flushOtelMetricsBatch Inserted OTLP metrics batch", { rows: rows.length, }); }); diff --git a/apps/webapp/server.ts b/apps/webapp/server.ts index 435a42b6b6..e0121480ff 100644 --- a/apps/webapp/server.ts +++ b/apps/webapp/server.ts @@ -108,7 +108,16 @@ if (ENABLE_CLUSTER && cluster.isPrimary) { // more aggressive with this caching. app.use(express.static("public", { maxAge: "1h" })); - app.use(morgan("tiny")); + // On high-volume machine-ingest services (e.g. otel) the per-request access + // log dominates log volume. HTTP_ACCESS_LOG_DISABLED suppresses successful + // (2xx) access logs; non-2xx responses are always logged so errors stay visible. 
+ const suppressSuccessfulAccessLogs = process.env.HTTP_ACCESS_LOG_DISABLED === "1"; + app.use( + morgan("tiny", { + skip: (_req, res) => + suppressSuccessfulAccessLogs && res.statusCode >= 200 && res.statusCode < 300, + }) + ); process.title = ENABLE_CLUSTER ? `node webapp-worker-${cluster.isWorker ? cluster.worker?.id : "solo"}`