bufstream.yaml

The bufstream.yaml file defines configuration for a Bufstream broker. The Bufstream CLI can be instructed to use the configuration file with the -c flag.
Annotated bufstream.yaml configuration
# All bufstream.yaml files should have a version. The only currently-valid
# version is v1beta1.
version: v1beta1

# Type Reference:
# - value_1|value_2: A YAML string, either "value_1" or "value_2".
# - <bool>: A YAML boolean value, either true or false.
# - <int>: A YAML number for an integer value.
# - <double>: A YAML number for a floating point value.
# - <string>: A YAML string value.
# - <hostport>: A YAML string containing a hostname or IP address and port,
#               separated by `:`.
# - <data-source>: Either an inline YAML string value,
#                  or a YAML object with the following shape:
#   ```
#   # A file path to the data relative to the current working directory.
#   # Trailing newlines are stripped from the file contents.
#   path: <string>
#   # An environment variable containing the data.
#   env_var: <string>
#   # An inline string of the data.
#   string: <string>
#   # The encoding of the data source value. Defaults to plaintext.
#   encoding: plaintext|base64
#   ```

# The name of the cluster. All brokers in the same cluster should have the
# same value. This value should not contain sensitive information as it may
# appear in keys, logs, traces, metrics, etc.
cluster: <string>

# The location of the broker, i.e., the datacenter/rack/availability zone
# where the broker is running. If unspecified, the broker will attempt to
# auto-detect the availability zone of the node using host metadata services.
zone: <string>

# Kafka-specific configuration.
kafka:
  # Configuration for the Kafka listeners.
  #
  # By default, a single listener is exposed on localhost:9092, with TLS and
  # authentication disabled.
  listeners:
    # The unique name of the listener (required if more than one listener is defined).
    - name: <string>
      # The address the Kafka server should listen on.
      listen_address: <hostport>
      # The address clients should use to connect to the Kafka server, if different
      # from `listen_address`. This can be used to ensure clients connect via a
      # load balancer or gateway.
      advertise_address: <hostport>
      # If populated, enables and enforces TLS termination on the Kafka server.
      tls:
        # Certificates to present to the client. The first certificate compatible
        # with the client's requirements is selected automatically.
        certificates:
          # The PEM-encoded leaf certificate, which may contain intermediate certificates
          # following the leaf certificate to form a certificate chain.
          - chain: <data-source>
            # The PEM-encoded (unencrypted) private key of the certificate chain.
            private_key: <data-source>
        # Enable TLS-based authentication using Mutual TLS (mTLS).
        mtls:
          # Requires the use of an mTLS client certificate. Defaults to true if any
          # mTLS certificate authorities are specified.
          require: <bool>
          # The PEM-encoded certificate authorities used by the server to validate
          # the client certificates. If set, certificates will be validated, even if
          # they are not required.
          certificate_authorities:
            - <data-source>
      # If populated, enables and enforces authentication.
      authentication:
        sasl:
          # Configuration for the PLAIN mechanism.
          # See https://datatracker.ietf.org/doc/html/rfc4616.
          plain:
            # Must have at least one value if SASL is used.
            credentials:
              # The source of the basicauth username.
              - username: <data-source>
                # The source of the basicauth password.
                password: <data-source>
          # Whether to accept ANONYMOUS as a mechanism. Not recommended.
          # See https://datatracker.ietf.org/doc/html/rfc4505.
          anonymous: <bool>
          # Configuration for the SCRAM-* mechanisms.
          # See https://datatracker.ietf.org/doc/html/rfc5802.
          scram:
            # The bootstrapped admin credentials.
            admin_credentials:
              username: <data-source>
              hash: sha256|sha512
              password:
                plaintext: <data-source>
                salted:
                  salted_password: <data-source>
                  salt: <data-source>
                  iterations: <int>
          # Configuration for the OAUTHBEARER mechanism.
          oauth_bearer:
            # Supported signing algorithms:
            # - RS256, RS384, RS512
            # - ES256, ES384, ES512
            # - HS256, HS384, HS512
            # - EdDSA
            jwks:
              # Static JWKS file or content.
              static: <data-source>
              # An endpoint serving JWKS that is periodically refreshed.
              remote:
                # An HTTPS URL for the JWKS file.
                url: <data-source>
                # The keys are loaded from the URL once on startup and cached.
                # This controls the cache duration.
                #
                # Defaults to an hour. Set to a negative number to never refresh.
                refresh_interval: <duration>
                # TLS configuration. If unset, a default configuration is used.
                tls:
                  # Controls whether a client verifies the server's certificate chain and host
                  # name. If true, the dialer accepts any certificate presented by the server
                  # and host name in that certificate. In this mode, TLS is susceptible to
                  # machine-in-the-middle attacks and should only be used for testing.
                  insecure_skip_verify: <bool>
                  # The PEM-encoded certificate authorities used by the client to validate
                  # the server certificates. If empty, the host's root CA set is used.
                  certificate_authorities:
                    - <data-source>
            # If provided, will match the 'aud' claim to this value.
            audience: <string>
            # If provided, will match the 'iss' claim to this value.
            issuer: <string>
        # If set, will use the configured mTLS for authentication.
        #
        # This acts as a fallback if SASL is also enabled.
        mtls:
          # Where to extract the principal from the client certificate.
          principal_source: anonymous|subject_common_name|san_dns|san_uri

# Configuration for debug features.
debug:
  # Address to listen for connections for debug information. If configured,
  # pprof and Prometheus exported metrics will be exposed on this address.
  listen_address: <hostport>

# Configuration for logging.
#
# By default, logs are produced at the info level.
logging:
  # Log level, defaults to info.
  level: debug|info|warn|error

# Configuration for metrics.
#
# By default, Prometheus metrics will be exposed at the debug address.
metrics:
  # Configuration for exporting OpenTelemetry metrics.
  otlp:
    # Required: Type of transport to use for OTLP.
    type: http|grpc
    # Required: URL of OTLP endpoint to export metrics to.
    url: <string>
  # The labels allowed to be used in metrics collection.
  #
  # Labels are custom key-value pairs that are added to logs, metrics, and traces.
  # Labels can be specified in Kafka client IDs (e.g., "my-client-id;label.foo=bar") or
  # in topic configuration. When specifying keys here, do not include the "label." portion.
  #
  # By default, no labels from client IDs or topic configurations are added to metrics.
  include_labels:
      # Required: A specific label key to allow in metrics collection.  Keys can contain only
      # lowercase letters, numeric characters, underscores, and dashes. All characters must
      # use UTF-8 encoding, and international characters are allowed. Keys have a minimum
      # length of 1 character and a maximum length of 63 characters, and cannot be empty.
    - key: <string>
      # A list of allowed values for a given key.  Values can contain only lowercase letters,
      # numeric characters, underscores, and dashes. All characters must use UTF-8 encoding,
      # and international characters are allowed. Values can be empty, and have a maximum
      # length of 63 characters.
      #
      # By default, all values are accepted.
      values:
        - <string>

# Configuration for traces.
#
# By default, no traces are exported.
traces:
  # Configuration for exporting OpenTelemetry-based traces.
  otlp:
    # Required: Type of transport to use for OTLP.
    type: http|grpc
    # Required: URL of OTLP endpoint to export traces to.
    url: <string>
    # OpenTelemetry trace sample ratio, defaults to 0.1.
    trace_ratio: <double>

# Configuration for metadata storage.
#
# If this section is present, exactly one top-level key within metadata must
# be specified.
#
# If this section is omitted, an in-memory implementation is used.
metadata:
  # If specified, the broker will use etcd as the metadata storage of the cluster.
  etcd:
    # The etcd node addresses.
    #
    # Currently, Bufstream assumes no path-prefix when connecting to
    # the etcd cluster.
    #
    # If no addresses are specified, an embedded etcd server will be used. This
    # is only suitable for testing.
    addresses:
      - <hostport>
  # If specified, the broker will use PostgreSQL as the metadata storage of the cluster,
  # using the Data Source Name or database URI provided by the data source to connect to.
  postgres: <data-source>
  # Some connections (such as Cloud SQL) require additional options. The following form,
  # with `dsn` as a key, is also accepted:
  postgres:
    # Required: Data Source Name or database URL of PostgreSQL server to connect to.
    dsn: <data-source>
    # Configuration for connecting to a Google Cloud SQL PostgreSQL instance.
    cloud_sql:
      # Required: ICN is the Cloud SQL instance's connection name, typically in
      # the format "project-name:region:instance-name".
      icn: <data-source>
      # Use IAM auth to connect to the Cloud SQL database.
      iam: <bool>
      # Use private IP to connect to the Cloud SQL database.
      private_ip: <bool>
    # Configuration settings for the database connection pool.
    pool:
      # The maximum size of the connection pool. Defaults to 20.
      max_connections: <int>
      # The minimum size of the connection pool. Defaults to 0.
      min_connections: <int>
  # If specified, the broker will use Google Cloud Spanner as the metadata storage
  # of the cluster.
  spanner:
    # Required: The Spanner project ID.
    project_id: <string>
    # Required: The Spanner instance ID.
    instance_id: <string>
    # Required: The Spanner database name.
    database_name: <string>

# Configuration for data storage.
#
# By default, data will be stored in-memory, unless a durable metadata storage provider is
# selected, in which case data will be stored on the local filesystem, at
# `$HOME/.local/share/bufstream` on Linux and `%LocalAppData%\bufstream` on Windows.
data:
  # If specified, use an Amazon S3-compatible storage provider with the given storage bucket
  # and prefix, e.g. `s3://my-bucket/my-prefix/`
  s3: <string>
  # Additional options can be specified. If so, the storage bucket and prefix will instead be
  # specified under the `uri` key.
  s3:
    # URI of the storage bucket and prefix, e.g. `s3://my-bucket/my-prefix/`
    uri: <string>
    # The region in which the bucket exists. It is necessary to specify this if
    # the broker is in a different region than the bucket, or if the broker is
    # running outside of AWS.
    region: <string>
    # The endpoint to connect to. If specified, overrides the S3-compatible
    # endpoint that Bufstream will connect to. This is necessary if using an
    # S3-compatible provider other than AWS, or when connecting to special S3
    # endpoints such as those provided for GovCloud or FIPS compliance.
    endpoint: <string>
    # Access key ID to use. If set, `secret_access_key` must also be set.
    access_key_id: <data-source>
    # Secret access key to use for authentication. If set, `access_key_id`
    # must also be set.
    secret_access_key: <data-source>
    # Use path-style requests instead of virtual-hosted–style requests.
    # Most S3 providers use virtual hosting, but some configurations
    # require path-style requests, such as with Minio.
    force_path_style: <bool>
  # If specified, use Google Cloud Storage with the provided storage bucket URI,
  # e.g. `gs://my-bucket/my-prefix/`
  gcs: <string>
  # If specified, use Azure Blob Storage with the provided container URI,
  # e.g. `https://myaccount.blob.core.windows.net/mycontainer/myprefix`
  azure: <string>
  # Additional options can be specified. If so, the container URI will instead be
  # specified under the `uri` key.
  azure:
    # Container URI to connect to, e.g. `https://myaccount.blob.core.windows.net/mycontainer/myprefix`
    uri: <string>
    # Access key ID to use. If set, `secret_access_key` must also be set.
    access_key_id: <data-source>
    # Secret access key to use for authentication. If set, `access_key_id`
    # must also be set.
    secret_access_key: <data-source>

# The schema registry used for data enforcement. Only one configured schema registry
# is allowed.
schema_registry:
  # Buf Schema Registry
  bsr:
    # Hostname of Buf Schema Registry (example: 'buf.build'). Required.
    host: <string>
    # API token used to authenticate to the BSR. It is recommended to use a bot user's token
    # (https://buf.build/docs/bsr/admin/instance/bot-users/).
    token: <data-source>
    # TLS configuration. If unset, a default configuration is used.
    tls:
      # Controls whether a client verifies the server's certificate chain and host
      # name. If true, the dialer accepts any certificate presented by the server
      # and host name in that certificate. In this mode, TLS is susceptible to
      # machine-in-the-middle attacks and should only be used for testing.
      insecure_skip_verify: <bool>
      # The PEM-encoded certificate authorities used by the client to validate
      # the server certificates. If empty, the host's root CA set is used.
      certificate_authorities:
        - <data-source>

  # Confluent Schema Registry
  confluent:
    # Root URL (including protocol and any required path prefix) of the CSR API.
    url: <string>
    # Name of the CSR instance within the BSR. This name is used to disambiguate
    # subjects of the same name within the same schema file. Used exclusively
    # for schema coercion.
    instance_name: <string>
    # TLS configuration. If unset and the url field specifies https, a default
    # configuration is used.
    tls:
      # Controls whether a client verifies the server's certificate chain and host
      # name. If true, the dialer accepts any certificate presented by the server
      # and host name in that certificate. In this mode, TLS is susceptible to
      # machine-in-the-middle attacks and should only be used for testing.
      insecure_skip_verify: <bool>
      # The PEM-encoded certificate authorities used by the client to validate
      # the server certificates. If empty, the host's root CA set is used.
      certificate_authorities:
        - <data-source>
    # Authentication to use for the registry. At most one method may be specified.
    authentication:
      # Authenticate against the CSR API using basic auth credentials.
      basic_auth:
        # The source of the basicauth username.
        username: <data-source>
        # The source of the basicauth password.
        password: <data-source>

# Configuration for Iceberg integration, for exposing Kafka topics as tables
# in Apache Iceberg v2 format.
#
# By default, Iceberg integration is disabled.
iceberg:
    # Name of this catalog, used to disambiguate multiple catalogs used across
    # topics and tables.
  - name: <string>
    # REST catalog. Valid table names must be in the form "namespace.table". The
    # namespace may contain multiple components such as "ns1.ns2.ns3.table". The
    # underlying catalog implementation that provides the REST API may impose
    # further constraints on table and namespace naming.
    #
    # Also see
    # https://github.com/apache/iceberg/blob/main/open-api/rest-catalog-open-api.yaml
    rest:
      # Root URL (including protocol and any required path prefix) of the catalog server.
      url: <string>
      # Optional URI prefix. This is separate from any URI prefix present in `url`. This
      # prefix appears after the "/v1/" API path component but before the remainder of
      # the URI path.
      uri_prefix: <string>
      # Optional warehouse location. Some REST catalogs require this property in the
      # client's initial configuration requests.
      warehouse: <string>
      # TLS configuration. If unset and the url field specifies https, a default
      # configuration is used.
      tls:
        # Controls whether a client verifies the server's certificate chain and host
        # name. If true, the dialer accepts any certificate presented by the server
        # and host name in that certificate. In this mode, TLS is susceptible to
        # machine-in-the-middle attacks and should only be used for testing.
        insecure_skip_verify: <bool>
        # The PEM-encoded certificate authorities used by the client to validate
        # the server certificates. If empty, the host's root CA set is used.
        certificate_authorities:
          - <data-source>
      # Authentication to use for the catalog. At most one method may be specified.
      authentication:
        # Authenticate against the Iceberg catalog using basic auth credentials.
        basic_auth:
          # The source of the basicauth username.
          username: <data-source>
          # The source of the basicauth password.
          password: <data-source>
        # Authenticate against the Iceberg catalog with the given static bearer token
        # (which could be a long-lived OAuth2 token).
        bearer_token: <data-source>
        # Authenticate against the Iceberg catalog with the given OAuth2 configuration.
        oauth2:
          # The URL of the token endpoint, used to provision access tokens for use with
          # requests to the catalog. If not specified, this defaults to the catalog's
          # base URL with "v1/oauth/tokens" appended to the URI path, which matches the
          # URI of the endpoint as specified in the Iceberg Catalog's OpenAPI spec.
          token_endpoint_url: <string>
          # The scope to request when provisioning an access token. If not specified,
          # defaults to "catalog".
          scope: <string>
          # The client ID used to authenticate to the token endpoint.
          client_id: <data-source>
          # The client secret used to authenticate to the token endpoint.
          client_secret: <data-source>
          # Optional alternate TLS configuration for the token endpoint. If not
          # specified, accessing the token endpoint will use the same TLS configuration
          # as used for accessing other REST catalog endpoints.
          # (See RESTCatalogConfig.tls).
          tls:
            # Controls whether a client verifies the server's certificate chain and host
            # name. If true, the dialer accepts any certificate presented by the server
            # and host name in that certificate. In this mode, TLS is susceptible to
            # machine-in-the-middle attacks and should only be used for testing.
            insecure_skip_verify: <bool>
            # The PEM-encoded certificate authorities used by the client to validate
            # the server certificates. If empty, the host's root CA set is used.
            certificate_authorities:
              - <data-source>
    # Google Cloud BigQuery Metastore. Valid table names must be in the form
    # "dataset.table".
    bigquery_metastore:
      # The GCP project of the BigQuery Metastore. If empty, this is assumed to be the
      # current project in which the bufstream workload is running.
      project: <string>
      # The location for any BigQuery datasets that are created. Must be present if
      # cloud_resource_connection is present. Otherwise, if absent, datasets cannot be
      # auto-created, so any dataset referenced by an Iceberg table name must already
      # exist.
      location: <string>
      # The name of a BigQuery Cloud Resource connection. This is only the simple name
      # of the connection, not the full name. Since a BigQuery dataset can only use
      # connections in the same project and location, the full connection name (which
      # includes its project and location) is not necessary.
      #
      # If absent, no override connection will be associated with created tables.
      cloud_resource_connection: <string>
    # AWS Glue Data Catalog. Valid table names must be in the form
    # "database.table".
    aws_glue_data_catalog:
      # The AWS account ID of the AWS Glue catalog.
      #
      # This is normally not necessary as it defaults to the account ID for the
      # IAM user of the workload. But if the workload's credentials are not those
      # of an IAM user or if the Glue catalog is defined in a different AWS
      # account, then this must be specified.
      aws_account_id: <string>
      # The AWS region to indicate in the credential scope of the signature.
      #
      # This field defaults to the region of the broker's host.
      region: <string>
      # Specifies the AWS access key ID for authentication to the resource.
      #
      # By default, authentication is performed using the metadata service of the
      # broker's host. If set, `secret_access_key` must also be provided.
      access_key_id: <data-source>
      # Specifies the AWS secret access key for authentication to the resource.
      #
      # By default, authentication is performed using the metadata service of the
      # broker's host. If set, `access_key_id` must also be provided.
      secret_access_key: <data-source>
      # Specifies the AWS session token when using AWS temporary credentials to
      # access the cloud resource. Omit when not using temporary credentials.
      #
      # Temporary credentials are not recommended for production workloads, but
      # can be useful in development and test environments to authenticate local
      # processes with remote AWS resources.
      #
      # This value should only be present when `access_key_id` and
      # `secret_access_key` are also set.
      session_token: <data-source>

# Configuration for the admin RPC interface.
#
# By default, the admin RPC interface listens on localhost:9089.
admin:
  # The address to listen on for Admin RPCs.
  listen_address: <hostport>
  # If populated, enables and enforces TLS termination on the admin interface.
  tls:
    # Certificates to present to the client. The first certificate compatible
    # with the client's requirements is selected automatically.
    certificates:
        # The PEM-encoded leaf certificate, which may contain intermediate certificates
        # following the leaf certificate to form a certificate chain.
      - chain: <data-source>
        # The PEM-encoded (unencrypted) private key of the certificate chain.
        private_key: <data-source>
    # Enable TLS-based authentication using Mutual TLS (mTLS).
    mtls:
      # Requires the use of an mTLS client certificate. Defaults to true if any
      # mTLS certificate authorities are specified.
      require: <bool>
      # The PEM-encoded certificate authorities used by the server to validate
      # the client certificates. If set, certificates will be validated, even if
      # they are not required.
      certificate_authorities:
        - <data-source>