From 77bb59f594059611c5a24e6012fd8684fe24a26f Mon Sep 17 00:00:00 2001 From: "Trez.One" Date: Wed, 21 May 2025 09:16:28 -0400 Subject: [PATCH] Renaming Grafana alloy config. --- .../app-configs/grafana_alloy_config.alloy.j2 | 404 ++++++++++++++++++ 1 file changed, 404 insertions(+) create mode 100644 ansible/app-configs/grafana_alloy_config.alloy.j2 diff --git a/ansible/app-configs/grafana_alloy_config.alloy.j2 b/ansible/app-configs/grafana_alloy_config.alloy.j2 new file mode 100644 index 00000000..a6441e39 --- /dev/null +++ b/ansible/app-configs/grafana_alloy_config.alloy.j2 @@ -0,0 +1,404 @@ +{% set vault_addr = 'https://vault.trez.wtf' %} +{% set secrets_path = 'rinoa-docker/env' %} + +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Agent globals +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +local.file "endpoints" { + // The endpoints file is used to define the endpoints, credentials and options + // for the Agent export to. + filename = "/etc/alloy/endpoints.json" +} + +discovery.docker "rinoadocker" { + host = env("DOCKER_HOST") +} + +tracing { + write_to = [otelcol.exporter.otlp.tempo.input] +} +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Metrics +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +prometheus.remote_write "mimir" { + endpoint { + url = json_path(local.file.endpoints.content, ".metrics.url")[0] + basic_auth { + username = json_path(local.file.endpoints.content, ".metrics.basicAuth.username")[0] + password = json_path(local.file.endpoints.content, ".metrics.basicAuth.password")[0] + } + } +} + +prometheus.scrape "prometheus" { + targets = [{ + __address__ = "localhost:12345", + }] + forward_to = [prometheus.remote_write.mimir.receiver] + job_name = "prometheus" +} + +prometheus.exporter.unix "rinoa" { + procfs_path = "/host/proc" + sysfs_path = "/host/sys" + rootfs_path = "/rootfs" +} + +prometheus.scrape "rinoa" { + targets = prometheus.exporter.unix.rinoa.targets + forward_to = [prometheus.remote_write.mimir.receiver] + job_name = "rinoa_host" +} + +prometheus.exporter.cadvisor "docker" { + docker_host = env("DOCKER_HOST") + storage_duration = "5m" +} + +prometheus.scrape "docker" { + targets = prometheus.exporter.cadvisor.docker.targets + forward_to = [prometheus.remote_write.mimir.receiver] + job_name = "docker_stats" +} +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Logging +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +loki.write "loki" { + endpoint { + url = json_path(local.file.endpoints.content, ".logs.url")[0] + basic_auth { + username = json_path(local.file.endpoints.content, ".logs.basicAuth.username")[0] + password = json_path(local.file.endpoints.content, ".logs.basicAuth.password")[0] + } + } + external_labels = {} +} + +loki.source.journal "hostjournal" { + forward_to = [loki.write.loki.receiver] + max_age = "24h" + path = "/rootfs/var/log/journal/" + labels = { + job = "host-journal", + } +} + +local.file_match "system" { + path_targets = [{ + __address__ = "localhost", + __path__ = "/rootfs/var/log/*log", + job = "varlogs", + }] +} + +loki.source.file "system" { + targets = local.file_match.system.targets + forward_to = [loki.write.loki.receiver] +} + +loki.source.docker "containers" { + host = env("DOCKER_HOST") + targets = discovery.docker.rinoadocker.targets + forward_to = [loki.write.loki.receiver] + labels = { + job = "containerlogs", + } +} + +loki.process "containers" { + forward_to = [loki.write.loki.receiver] + // stage.docker {} + stage.json { + expressions = { + attrs = "", + output = "log", + stream = "stream", + } + } + + stage.json { + expressions = { + tag = "", + } + source = "attrs" + } + + stage.regex { + expression = "(?P(?:[^|]*[^|])).(?P(?:[^|]*[^|])).(?P(?:[^|]*[^|])).(?P(?:[^|]*[^|]))" + source = "tag" + } + + stage.timestamp { + source = "time" + format = "RFC3339Nano" + } + + stage.labels { + values = { + container_id = null, + container_name = null, + image_id = null, + image_name = null, + stream = null, + tag = null, + } + } + + stage.output { + source = "output" + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Traces +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +beyla.ebpf "rinoadocker" { + open_port = "80-65535" + routes { + unmatched = "heauristic" + } + output { + traces = [ + otelcol.connector.servicegraph.tracemetrics.input, + otelcol.connector.spanmetrics.tracemetrics.input, + otelcol.processor.batch.default.input, + otelcol.connector.spanlogs.autologging.input, + ] + } +} + +prometheus.scrape "beyla" { + targets = beyla.ebpf.rinoadocker.targets + forward_to = [prometheus.remote_write.mimir.receiver] +} + +otelcol.auth.headers "tempo" { + header { + key = "Authorization" + value = join(["Basic ", json_path(local.file.endpoints.content, ".traces.basicAuthToken")[0]], "") + } +} + +otelcol.processor.batch "default" { + // Wait until we've received 16K of data. + send_batch_size = 16384 + send_batch_max_size = 16384 + // Or until 2 seconds have elapsed. + timeout = "2s" + // When the Agent has enough batched data, send it to the OpenTelemetry exporter named 'tempo'. + output { + traces = [otelcol.exporter.otlp.tempo.input] + } +} + +otelcol.exporter.otlp "tempo" { + // Define the client for exporting. + client { + // Authentication block. + auth = otelcol.auth.headers.tempo.handler + + // Send to the locally running Tempo instance, on port 4317 (OTLP gRPC). + endpoint = json_path(local.file.endpoints.content, ".traces.url")[0] + + // Configure TLS settings for communicating with the endpoint. + tls { + // The connection is insecure. + insecure = json_path(local.file.endpoints.content, ".traces.tls.insecure")[0] + // Do not verify TLS certificates when connecting. + insecure_skip_verify = json_path(local.file.endpoints.content, ".traces.tls.insecureSkipVerify")[0] + } + } +} + +otelcol.connector.spanlogs "autologging" { + // We only want to output a line for each root span (ie. every single trace), and not for every + // process or span (outputting a line for every span would be extremely verbose). + spans = false + roots = true + processes = false + // We want to ensure that the following three span attributes are included in the log line, if + // present. + span_attributes = [ "http.method", "http.target", "http.status_code" ] + + // Overrides the default key in the log line to be `traceId`, which is then used by Grafana to + // identify the trace ID for correlation with the Tempo datasource. + overrides { + trace_id_key = "traceId" + } + // Send to the OpenTelemetry Loki exporter. + output { + logs = [otelcol.exporter.loki.autologging.input] + } +} + +// Simply forwards the incoming OpenTelemetry log format out as a Loki log. +// We need this stage to ensure we can then process the logline as a Loki object. +otelcol.exporter.loki "autologging" { + forward_to = [loki.process.autologging.receiver] +} + +// The Loki processor allows us to accept a correctly formatted Loki log and mutate it into +// a set of fields for output. +loki.process "autologging" { + // The JSON stage simply extracts the `body` (the actual logline) from the Loki log, ignoring + // all other fields. + stage.json { + expressions = { "body" = "" } + } + // The output stage takes the body (the main logline) and uses this as the source for the output + // logline. In this case, it essentially turns it into logfmt. + stage.output { + source = "body" + } + + // Finally send the processed logline onto the Loki exporter. + forward_to = [loki.write.autologging.receiver] +} + +// The Loki writer receives a processed Loki log and then writes it to a Loki instance. +loki.write "autologging" { + // Add the `agent` value to the `job` label, so we can identify it as having been generated + // by Grafana Agent when querying. + external_labels = { + job = "agent", + } + + // Output the Loki log to the local Loki instance. + endpoint { + url = json_path(local.file.endpoints.content, ".logs.url")[0] + + // The basic auth credentials for the Loki instance. + basic_auth { + username = json_path(local.file.endpoints.content, ".logs.basicAuth.username")[0] + password = json_path(local.file.endpoints.content, ".logs.basicAuth.password")[0] + } + } +} + +// The Tail Sampling processor will use a set of policies to determine which received traces to keep +// and send to Tempo. +otelcol.processor.tail_sampling "errors" { + // Total wait time from the start of a trace before making a sampling decision. Note that smaller time + // periods can potentially cause a decision to be made before the end of a trace has occurred. + decision_wait = "30s" + + // The following policies follow a logical OR pattern, meaning that if any of the policies match, + // the trace will be kept. For logical AND, you can use the `and` policy. Every span of a trace is + // examined by each policy in turn. A match will cause a short-circuit. + + // This policy defines that traces that contain errors should be kept. + policy { + // The name of the policy can be used for logging purposes. + name = "sample-erroring-traces" + // The type must match the type of policy to be used, in this case examing the status code + // of every span in the trace. + type = "status_code" + // This block determines the error codes that should match in order to keep the trace, + // in this case the OpenTelemetry 'ERROR' code. + status_code { + status_codes = [ "ERROR" ] + } + } + + // This policy defines that only traces that are longer than 200ms in total should be kept. + policy { + // The name of the policy can be used for logging purposes. + name = "sample-long-traces" + // The type must match the policy to be used, in this case the total latency of the trace. + type = "latency" + // This block determines the total length of the trace in milliseconds. + latency { + threshold_ms = 200 + } + } + + // The output block forwards the kept traces onto the batch processor, which will marshall them + // for exporting to Tempo. + output { + traces = [otelcol.processor.batch.default.input] + } +} + +// The Spanmetrics Connector will generate RED metrics based on the incoming trace span data. +otelcol.connector.spanmetrics "tracemetrics" { + // The namespace explicit adds a prefix to all the generated span metrics names. + // In this case, we'll ensure they match as closely as possible those generated by Tempo. + namespace = "traces.spanmetrics" + + // Each extra dimension (metrics label) to be added to the generated metrics from matching span attributes. These + // need to be defined with a name and optionally a default value (in the following cases, we do not want a default + // value if the span attribute is not present). + dimension { + name = "http.method" + } + dimension { + name = "http.target" + } + dimension { + name = "http.status_code" + } + dimension { + name = "service.version" + } + + // A histogram block must be present, either explicitly defining bucket values or via an exponential block. + // We do the latter here. + histogram { + explicit { + } + } + + // The exemplar block is added to ensure we generate exemplars for traces on relevant metric values. + exemplars { + enabled = true + } + + // Generated metrics data is in OTLP format. We send this data to the OpenTelemetry Prometheus exporter to ensure + // it gets transformed into Prometheus format data. + output { + metrics = [otelcol.exporter.prometheus.tracemetrics.input] + } +} + +// The Servicegraph Connector will generate service graph metrics (edges and nodes) based on incoming trace spans. +otelcol.connector.servicegraph "tracemetrics" { + // Extra dimensions (metrics labels) to be added to the generated metrics from matching span attributes. + // For this component, this is defined as an array. There are no default values and the labels will not be generated + // for missing span attributes. + dimensions = [ + "http.method", + "http.target", + "http.status_code", + "service.version", + ] + + // Generated metrics data is in OTLP format. We send this data to the OpenTelemetry Prometheus exporter to ensure + // it gets transformed into Prometheus format data. + output { + metrics = [otelcol.exporter.prometheus.tracemetrics.input] + } +} + +otelcol.exporter.prometheus "tracemetrics" { + // Forward to our local Prometheus remote writer which will send the metrics to Mimir. + forward_to = [prometheus.remote_write.mimir.receiver] +} +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Profiling +///////////////////////////////////////////////////////////////////////////////////////////////////////////// + +pyroscope.write "pyroscope" { + endpoint { + url = json_path(local.file.endpoints.content, ".profiles.url")[0] + basic_auth { + username = json_path(local.file.endpoints.content, ".profiles.basicAuth.username")[0] + password = json_path(local.file.endpoints.content, ".profiles.basicAuth.password")[0] + } + } + external_labels = {} +} + +pyroscope.ebpf "rinoadocker" { + forward_to = [pyroscope.write.pyroscope.receiver] + targets = discovery.docker.rinoadocker.targets +} \ No newline at end of file