// rinoa-docker/ansible/app-configs/grafana/alloy/alloy_config.alloy

{% set vault_addr = 'https://vault.trez.wtf' %}
{% set secrets_path = 'rinoa-docker/env' %}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Agent globals
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
local.file "endpoints" {
    // JSON document holding the URLs, credentials and TLS options that the
    // exporters in this file read through json_path().
    filename = "/etc/alloy/endpoints.json"
}

// Container discovery against the Docker daemon named by the DOCKER_HOST
// environment variable. The resulting targets are consumed by
// loki.source.docker "containers" and pyroscope.ebpf "rinoadocker".
discovery.docker "rinoadocker" {
    host = env("DOCKER_HOST")
}

// Sends Alloy's own traces to the OTLP exporter declared in the Traces
// section (otelcol.exporter.otlp "tempo").
tracing {
    write_to = [otelcol.exporter.otlp.tempo.input]
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Metrics
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Remote-write sink for every metric pipeline in this file. The URL and the
// basic-auth credentials are read from the `.metrics` section of endpoints.json.
prometheus.remote_write "mimir" {
    endpoint {
        url = json_path(local.file.endpoints.content, ".metrics.url")[0]

        basic_auth {
            username = json_path(local.file.endpoints.content, ".metrics.basicAuth.username")[0]
            password = json_path(local.file.endpoints.content, ".metrics.basicAuth.password")[0]
        }
    }
}

// Scrapes one static target, localhost:12345, under the job name "prometheus".
// NOTE(review): presumably this is Alloy's own HTTP/metrics listener — confirm
// the port against the address Alloy is started with.
prometheus.scrape "prometheus" {
    job_name   = "prometheus"
    forward_to = [prometheus.remote_write.mimir.receiver]
    targets    = [
        {__address__ = "localhost:12345"},
    ]
}

// Host metrics exporter. The three paths are where the host's /proc, /sys and
// root filesystem are expected to be visible to Alloy.
// NOTE(review): presumably bind mounts into the Alloy container — confirm
// against the container definition.
prometheus.exporter.unix "rinoa" {
    rootfs_path = "/rootfs"
    procfs_path = "/host/proc"
    sysfs_path  = "/host/sys"
}

// Scrapes the host exporter above and forwards the samples to the remote
// writer under the job name "rinoa_host".
prometheus.scrape "rinoa" {
    job_name   = "rinoa_host"
    targets    = prometheus.exporter.unix.rinoa.targets
    forward_to = [prometheus.remote_write.mimir.receiver]
}

// Per-container resource metrics, collected from the Docker daemon named by
// the DOCKER_HOST environment variable.
prometheus.exporter.cadvisor "docker" {
    storage_duration = "5m"
    docker_host      = env("DOCKER_HOST")
}

// Scrapes the cAdvisor exporter above and forwards the samples to the remote
// writer under the job name "docker_stats".
prometheus.scrape "docker" {
    job_name   = "docker_stats"
    targets    = prometheus.exporter.cadvisor.docker.targets
    forward_to = [prometheus.remote_write.mimir.receiver]
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Logging
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Log sink used by the journal, file and Docker log sources. The URL and the
// basic-auth credentials are read from the `.logs` section of endpoints.json.
loki.write "loki" {
    // No extra labels are attached at write time.
    external_labels = {}

    endpoint {
        url = json_path(local.file.endpoints.content, ".logs.url")[0]

        basic_auth {
            username = json_path(local.file.endpoints.content, ".logs.basicAuth.username")[0]
            password = json_path(local.file.endpoints.content, ".logs.basicAuth.password")[0]
        }
    }
}

// Reads the systemd journal found under /rootfs/var/log/journal/ and labels
// every entry with job="host-journal".
loki.source.journal "hostjournal" {
    path    = "/rootfs/var/log/journal/"
    // Upper bound on how far back journal entries are read.
    max_age = "24h"
    labels  = {
        job = "host-journal",
    }
    forward_to = [loki.write.loki.receiver]
}

// Expands the glob /rootfs/var/log/*log into file targets labelled
// job="varlogs"; consumed by loki.source.file "system".
local.file_match "system" {
    path_targets = [
        {
            __address__ = "localhost",
            __path__    = "/rootfs/var/log/*log",
            job         = "varlogs",
        },
    ]
}

// Tails the files matched by local.file_match "system" and ships the lines to
// the main Loki writer.
loki.source.file "system" {
    forward_to = [loki.write.loki.receiver]
    targets    = local.file_match.system.targets
}

// Collects logs of the containers found by discovery.docker "rinoadocker" and
// labels them job="containerlogs".
// NOTE(review): entries go straight to loki.write "loki"; the
// loki.process "containers" pipeline below is not part of this path — confirm
// that this is intended.
loki.source.docker "containers" {
    host    = env("DOCKER_HOST")
    targets = discovery.docker.rinoadocker.targets
    labels  = {
        job = "containerlogs",
    }
    forward_to = [loki.write.loki.receiver]
}

// Processing pipeline for JSON-wrapped container log lines. Stages run in the
// order written: unpack the outer JSON, unpack `attrs`, split `tag` into its
// parts, set the timestamp, promote fields to labels, and finally replace the
// line with the extracted `output` value.
// NOTE(review): no component in this file forwards to
// loki.process.containers.receiver, so this pipeline appears to be unused —
// confirm.
loki.process "containers" {
    forward_to = [loki.write.loki.receiver]

    // stage.docker {}

    // Pull `attrs`, `log` (stored as `output`) and `stream` out of the line.
    stage.json {
        expressions = {
            attrs  = "",
            output = "log",
            stream = "stream",
        }
    }

    // Pull `tag` out of the previously extracted `attrs` object.
    stage.json {
        source      = "attrs"
        expressions = {
            tag = "",
        }
    }

    // Split `tag` into image_name, container_name, image_id and container_id.
    stage.regex {
        source     = "tag"
        expression = "(?P<image_name>(?:[^|]*[^|])).(?P<container_name>(?:[^|]*[^|])).(?P<image_id>(?:[^|]*[^|])).(?P<container_id>(?:[^|]*[^|]))"
    }

    // NOTE(review): no earlier stage extracts a `time` value, so this stage
    // may have nothing to parse — confirm.
    stage.timestamp {
        format = "RFC3339Nano"
        source = "time"
    }

    // Promote the extracted values to labels of the same name.
    stage.labels {
        values = {
            container_id   = null,
            container_name = null,
            image_id       = null,
            image_name     = null,
            stream         = null,
            tag            = null,
        }
    }

    // Use the extracted `output` value as the final log line.
    stage.output {
        source = "output"
    }
}

/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Traces
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Beyla eBPF auto-instrumentation. The traces it produces are fanned out to
// the service-graph and span-metrics connectors, the batch processor (and from
// there to the OTLP exporter), and the span-log connector.
beyla.ebpf "rinoadocker" {
    // Instrument processes listening on any port in this range.
    open_port = "80-65535"

    routes {
        // How to name HTTP routes that match no configured pattern. The value
        // was previously misspelled as "heauristic", which is not one of the
        // accepted values.
        unmatched = "heuristic"
    }

    output {
        traces = [
            otelcol.connector.servicegraph.tracemetrics.input,
            otelcol.connector.spanmetrics.tracemetrics.input,
            otelcol.processor.batch.default.input,
            otelcol.connector.spanlogs.autologging.input,
        ]
    }
}

// Scrapes the metrics exposed by the Beyla component and forwards them to the
// remote writer.
prometheus.scrape "beyla" {
    forward_to = [prometheus.remote_write.mimir.receiver]
    targets    = beyla.ebpf.rinoadocker.targets
}

// Builds the Authorization header used by the OTLP trace exporter: the literal
// prefix "Basic " followed by `.traces.basicAuthToken` from endpoints.json.
otelcol.auth.headers "tempo" {
    header {
        key = "Authorization"
        // join() with an empty separator concatenates the prefix and the token.
        value = join(["Basic ", json_path(local.file.endpoints.content, ".traces.basicAuthToken")[0]], "")
    }
}

// Batches traces before they are handed to the OTLP exporter "tempo".
otelcol.processor.batch "default" {
    // Flush after 2 seconds at the latest ...
    timeout = "2s"
    // ... or as soon as 16384 items have been buffered. These limits are item
    // counts, not bytes; the maximum batch size is capped at the same value.
    send_batch_size     = 16384
    send_batch_max_size = 16384

    // Completed batches go to the OpenTelemetry exporter named "tempo".
    output {
        traces = [otelcol.exporter.otlp.tempo.input]
    }
}

// OTLP exporter for traces. Endpoint, authentication and TLS behaviour are all
// driven by the `.traces` section of endpoints.json rather than hard-coded.
otelcol.exporter.otlp "tempo" {
    client {
        // Target address, taken from `.traces.url`.
        endpoint = json_path(local.file.endpoints.content, ".traces.url")[0]

        // Adds the Authorization header built by otelcol.auth.headers "tempo".
        auth = otelcol.auth.headers.tempo.handler

        tls {
            // Whether transport security is disabled, from `.traces.tls.insecure`.
            insecure = json_path(local.file.endpoints.content, ".traces.tls.insecure")[0]
            // Whether certificate verification is skipped, from
            // `.traces.tls.insecureSkipVerify`.
            insecure_skip_verify = json_path(local.file.endpoints.content, ".traces.tls.insecureSkipVerify")[0]
        }
    }
}

// Turns incoming traces into log lines ("automatic logging").
otelcol.connector.spanlogs "autologging" {
    // Emit one line per root span (i.e. one per trace) only; logging every
    // span or process would be far too verbose.
    roots     = true
    spans     = false
    processes = false

    // Span attributes copied into the log line when present.
    span_attributes = ["http.method", "http.target", "http.status_code"]

    // Emit the trace ID under the key `traceId`, which Grafana uses to
    // correlate the log line with the Tempo datasource.
    overrides {
        trace_id_key = "traceId"
    }

    // Hand the generated logs to the OpenTelemetry-to-Loki exporter.
    output {
        logs = [otelcol.exporter.loki.autologging.input]
    }
}

// Converts incoming OpenTelemetry logs into Loki log entries and forwards them.
// This step is needed so that the entries can be handled by a loki.process pipeline.
otelcol.exporter.loki "autologging" {
    forward_to = [loki.process.autologging.receiver]
}

// Reduces each span-derived Loki entry to just its `body` field before it is
// written out.
loki.process "autologging" {
    // Processed entries go to the dedicated autologging writer.
    forward_to = [loki.write.autologging.receiver]

    // Extract `body` (the actual log line) from the JSON entry; every other
    // field is ignored.
    stage.json {
        expressions = {
            "body" = "",
        }
    }

    // Replace the entry's line with the extracted `body`.
    stage.output {
        source = "body"
    }
}

// Writes the processed span log lines to Loki. It uses the same `.logs`
// endpoint and credentials from endpoints.json as loki.write "loki".
loki.write "autologging" {
    endpoint {
        url = json_path(local.file.endpoints.content, ".logs.url")[0]

        // Basic-auth credentials for the Loki endpoint.
        basic_auth {
            username = json_path(local.file.endpoints.content, ".logs.basicAuth.username")[0]
            password = json_path(local.file.endpoints.content, ".logs.basicAuth.password")[0]
        }
    }

    // Tag these entries with job="agent" so that generated span logs can be
    // told apart from other log streams when querying.
    external_labels = {
        job = "agent",
    }
}

// Tail-sampling processor: applies a set of policies to decide which received
// traces are kept and passed on towards the trace exporter.
// NOTE(review): no component in this file sends traces to
// otelcol.processor.tail_sampling.errors.input (beyla.ebpf feeds the batch
// processor directly), so these policies appear to be inactive — confirm.
otelcol.processor.tail_sampling "errors" {
    // How long to wait after a trace starts before deciding. Shorter waits
    // risk deciding before the trace has finished.
    decision_wait = "30s"

    // Policies are OR-ed: a trace is kept as soon as any one policy matches.
    // Use the `and` policy type when a logical AND is needed.

    // Keep traces that contain an error.
    policy {
        // Policy name, useful for logging.
        name = "sample-erroring-traces"
        // Policy type; this one examines the status code of every span in the trace.
        type = "status_code"

        // Status codes that cause a trace to be kept: the OpenTelemetry 'ERROR' code.
        status_code {
            status_codes = ["ERROR"]
        }
    }

    // Keep traces whose total duration exceeds 200ms.
    policy {
        // Policy name, useful for logging.
        name = "sample-long-traces"
        // Policy type; this one looks at the total latency of the trace.
        type = "latency"

        // Latency threshold in milliseconds.
        latency {
            threshold_ms = 200
        }
    }

    // Kept traces go to the batch processor, which marshals them for export.
    output {
        traces = [otelcol.processor.batch.default.input]
    }
}

// Span-metrics connector: derives RED (rate, errors, duration) metrics from
// the incoming trace spans.
otelcol.connector.spanmetrics "tracemetrics" {
    // Prefix for all generated metric names, chosen to match the names Tempo
    // generates as closely as possible.
    namespace = "traces.spanmetrics"

    // Extra metric labels taken from span attributes of the same name. No
    // default is given, so a label is omitted when the attribute is absent.
    dimension {
        name = "http.method"
    }
    dimension {
        name = "http.target"
    }
    dimension {
        name = "http.status_code"
    }
    dimension {
        name = "service.version"
    }

    // Generate exemplars so that metric values can link back to traces.
    exemplars {
        enabled = true
    }

    // A histogram block is mandatory and must contain either an `explicit` or
    // an `exponential` block. The explicit form is used here with no buckets
    // listed, so the component's default bucket boundaries apply.
    histogram {
        explicit {
        }
    }

    // The generated metrics are in OTLP format; send them to the OpenTelemetry
    // Prometheus exporter to be converted to Prometheus format.
    output {
        metrics = [otelcol.exporter.prometheus.tracemetrics.input]
    }
}

// Service-graph connector: derives service graph metrics (edges and nodes)
// from the incoming trace spans.
otelcol.connector.servicegraph "tracemetrics" {
    // Span attributes to add as extra metric labels. Here they are given as a
    // plain list; there are no default values, and a label is not generated
    // when the attribute is missing.
    dimensions = [
        "http.method",
        "http.target",
        "http.status_code",
        "service.version",
    ]

    // The generated metrics are in OTLP format; send them to the OpenTelemetry
    // Prometheus exporter to be converted to Prometheus format.
    output {
        metrics = [otelcol.exporter.prometheus.tracemetrics.input]
    }
}

// Receives the OTLP metrics produced by the span-metrics and service-graph
// connectors and hands them to the Prometheus pipeline.
otelcol.exporter.prometheus "tracemetrics" {
    // Forward to the Prometheus remote writer "mimir" declared in the Metrics section.
    forward_to = [prometheus.remote_write.mimir.receiver]
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Profiling
/////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Profile sink. The URL and the basic-auth credentials are read from the
// `.profiles` section of endpoints.json.
pyroscope.write "pyroscope" {
    // No extra labels are attached at write time.
    external_labels = {}

    endpoint {
        url = json_path(local.file.endpoints.content, ".profiles.url")[0]

        basic_auth {
            username = json_path(local.file.endpoints.content, ".profiles.basicAuth.username")[0]
            password = json_path(local.file.endpoints.content, ".profiles.basicAuth.password")[0]
        }
    }
}

// eBPF profiler for the containers found by discovery.docker "rinoadocker";
// collected profiles are sent to pyroscope.write "pyroscope".
pyroscope.ebpf "rinoadocker" {
    targets    = discovery.docker.rinoadocker.targets
    forward_to = [pyroscope.write.pyroscope.receiver]
}