bufstream.yaml
The bufstream.yaml file defines the configuration for a Bufstream broker. The
Bufstream CLI reads a configuration file when one is passed with the -c flag.
Annotated bufstream.yaml configuration
# All bufstream.yaml files should have a version. The only currently-valid
# version is v1beta1.
version: v1beta1
# Type Reference:
# - value_1|value_2: A YAML string, either "value_1" or "value_2".
# - <bool>: A YAML boolean value, either true or false.
# - <int>: A YAML number for an integer value.
# - <double>: A YAML number for a floating point value.
# - <string>: A YAML string value.
# - <hostport>: A YAML string containing a hostname or IP address and port,
# separated by `:`.
# - <duration>: A YAML string containing a duration, e.g., "30s".
# - <data-source>: Either an inline YAML string value,
# or a YAML object with the following shape:
# ```
# # A file path to the data relative to the current working directory.
# # Trailing newlines are stripped from the file contents.
# path: <string>
# # An environment variable containing the data.
# env_var: <string>
# # An inline string of the data.
# string: <string>
# # The encoding of the data source value. Defaults to plaintext.
# encoding: plaintext|base64
# ```
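# For example, a <data-source> password could be supplied in any of these
# forms (the environment variable and file path are illustrative):
# ```
# password: "my-secret"           # inline string shorthand
# password:
#   env_var: BUFSTREAM_PASSWORD   # read from an environment variable
# password:
#   path: secrets/password        # read from a file relative to the working directory
# ```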
# The name of the cluster. All brokers in the same cluster should have the
# same value. This value should not contain sensitive information as it may
# appear in keys, logs, traces, metrics, etc.
cluster: <string>
# The location of the broker, i.e., the datacenter/rack/availability zone
# where the broker is running. If unspecified, the broker will attempt to
# auto-detect the availability zone of the node using host metadata services.
zone: <string>
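# For example (illustrative values):
# ```
# cluster: prod-events
# zone: us-east-1a
# ```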
# Kafka-specific configuration.
kafka:
# Configuration for the Kafka listeners.
#
# By default, a single listener is exposed on localhost:9092, with no TLS and no authentication.
listeners:
# The unique name of the listener (required if more than one listener is defined).
- name: <string>
# The address the Kafka server should listen on.
listen_address: <hostport>
# The address clients should use to connect to the Kafka server, if different
# from `listen_address`. This can be used to ensure clients connect via a
# load balancer or gateway.
advertise_address: <hostport>
# If populated, enables and enforces TLS termination on the Kafka server.
tls:
# Certificates to present to the client. The first certificate compatible
# with the client's requirements is selected automatically.
certificates:
# The PEM-encoded leaf certificate, which may contain intermediate certificates
# following the leaf certificate to form a certificate chain.
- chain: <data-source>
# The PEM-encoded (unencrypted) private key of the certificate chain.
private_key: <data-source>
# Enable TLS-based authentication using Mutual TLS (mTLS).
mtls:
# Requires the use of an mTLS client certificate. Defaults to true if any
# mTLS certificate authorities are specified.
require: <bool>
# The PEM-encoded certificate authorities used by the server to validate
# the client certificates. If set, certificates will be validated, even if
# they are not required.
certificate_authorities:
- <data-source>
# If populated, enables and enforces authentication.
authentication:
sasl:
# Configuration for the PLAIN mechanism.
# See https://datatracker.ietf.org/doc/html/rfc4616.
plain:
# Must have at least one value if SASL is used.
credentials:
# The source of the basic auth username.
- username: <data-source>
# The source of the basic auth password.
password: <data-source>
# Whether to accept ANONYMOUS as a mechanism. Not recommended.
# See https://datatracker.ietf.org/doc/html/rfc4505.
anonymous: <bool>
# Configuration for the SCRAM-* mechanisms.
# See https://datatracker.ietf.org/doc/html/rfc5802.
scram:
# The bootstrapped admin credentials.
admin_credentials:
username: <data-source>
hash: sha256|sha512
password:
plaintext: <data-source>
salted:
salted_password: <data-source>
salt: <data-source>
iterations: <int>
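# For example, a minimal sketch of bootstrapped SCRAM admin credentials, with
# the plaintext password drawn from an illustrative environment variable:
# ```
# scram:
#   admin_credentials:
#     username: admin
#     hash: sha256
#     password:
#       plaintext:
#         env_var: BUFSTREAM_ADMIN_PASSWORD
# ```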
# Configuration for the OAUTHBEARER mechanism.
oauth_bearer:
# Supported signing algorithms:
# - RS256, RS384, RS512
# - ES256, ES384, ES512
# - HS256, HS384, HS512
# - EdDSA
jwks:
# Static JWKS file or content.
static: <data-source>
# An endpoint serving JWKS that is periodically refreshed.
remote:
# An HTTPS URL for the JWKS file.
url: <data-source>
# The keys are loaded from the URL once on startup and then cached.
# This controls how often the cached keys are refreshed.
#
# Defaults to an hour. Set to a negative number to never refresh.
refresh_interval: <duration>
# TLS configuration. If unset, a default configuration is used.
tls:
# Controls whether a client verifies the server's certificate chain and host
# name. If true, the dialer accepts any certificate presented by the server
# and any host name in that certificate. In this mode, TLS is susceptible to
# machine-in-the-middle attacks and should only be used for testing.
insecure_skip_verify: <bool>
# The PEM-encoded certificate authorities used by the client to validate
# the server certificates. If empty, the host's root CA set is used.
certificate_authorities:
- <data-source>
# If provided, the token's 'aud' claim must match this value.
audience: <string>
# If provided, the token's 'iss' claim must match this value.
issuer: <string>
# If set, the configured mTLS is used for authentication.
#
# This acts as a fallback if SASL is also enabled.
mtls:
# Where to extract the principal from the client certificate.
principal_source: anonymous|subject_common_name|san_dns|san_uri
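# Putting the above together, a minimal sketch of a single TLS listener with
# SASL PLAIN authentication (nesting follows the annotated reference above;
# hostnames, paths, and variable names are illustrative):
# ```
# kafka:
#   listeners:
#     - name: external
#       listen_address: 0.0.0.0:9092
#       advertise_address: kafka.example.com:9092
#       tls:
#         certificates:
#           - chain:
#               path: tls/server.crt
#             private_key:
#               path: tls/server.key
#       authentication:
#         sasl:
#           plain:
#             credentials:
#               - username: app
#                 password:
#                   env_var: KAFKA_APP_PASSWORD
# ```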
# Configuration for debug features.
debug:
# The address to listen on for debug connections. If configured, pprof and
# exported Prometheus metrics are exposed on this address.
listen_address: <hostport>
# Configuration for logging.
#
# By default, logs are produced at the info level.
logging:
# Log level, defaults to info.
level: debug|info|warn|error
# Configuration for metrics.
#
# By default, Prometheus metrics will be exposed at the debug address.
metrics:
# Configuration for exporting OpenTelemetry metrics.
otlp:
# Required: Type of transport to use for OTLP.
type: http|grpc
# Required: URL of OTLP endpoint to export metrics to.
url: <string>
# The labels allowed to be used in metrics collection.
#
# Labels are custom key-value pairs that are added to logs, metrics, and traces.
# Labels can be specified in Kafka client IDs (e.g., "my-client-id;label.foo=bar") or
# in topic configuration. When specifying keys here, do not include the "label." portion.
#
# By default, no labels from client IDs or topic configurations are added to metrics.
include_labels:
# Required: A specific label key to allow in metrics collection. Keys can contain only
# lowercase letters, numeric characters, underscores, and dashes. All characters must
# use UTF-8 encoding, and international characters are allowed. Keys must be between
# 1 and 63 characters long.
- key: <string>
# A list of allowed values for a given key. Values can contain only lowercase letters,
# numeric characters, underscores, and dashes. All characters must use UTF-8 encoding,
# and international characters are allowed. Values can be empty, and have a maximum
# length of 63 characters.
#
# By default, all values are accepted.
values:
- <string>
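# For example, to allow a per-tenant label that clients attach through their
# Kafka client ID (the label key and values are illustrative):
# ```
# # Client ID sent by a client: "billing-service;label.tenant=acme"
# metrics:
#   include_labels:
#     - key: tenant
#       values:
#         - acme
#         - globex
# ```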
# Configuration for traces.
#
# By default, no traces are exported.
traces:
# Configuration for exporting OpenTelemetry-based traces.
otlp:
# Required: Type of transport to use for OTLP.
type: http|grpc
# Required: URL of OTLP endpoint to export traces to.
url: <string>
# OpenTelemetry trace sample ratio, defaults to 0.1.
trace_ratio: <double>
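# For example, a sketch of exporting metrics and traces over OTLP to an
# assumed local collector, sampling 25% of traces:
# ```
# metrics:
#   otlp:
#     type: grpc
#     url: http://otel-collector:4317
# traces:
#   otlp:
#     type: http
#     url: http://otel-collector:4318
#   trace_ratio: 0.25
# ```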
# Configuration for metadata storage.
#
# Exactly one top-level key within metadata may be specified.
#
# By default, an in-memory implementation is used.
metadata:
# If specified, the broker will use etcd as the metadata storage of the cluster.
etcd:
# The etcd node addresses.
#
# Currently, Bufstream assumes no path-prefix when connecting to
# the etcd cluster.
#
# If no addresses are specified, an embedded etcd server will be used. This
# is only suitable for testing.
addresses:
- <hostport>
# If specified, the broker will use PostgreSQL as the metadata storage of the cluster,
# using the Data Source Name or database URI provided by the data source to connect to.
postgres: <data-source>
# Some connections (such as Cloud SQL) require additional options. The following form,
# with `dsn` as a key, is also accepted:
postgres:
# Required: Data Source Name or database URL of PostgreSQL server to connect to.
dsn: <data-source>
# Configuration for connecting to a Google Cloud SQL PostgreSQL instance.
cloud_sql:
# Required: ICN is the Cloud SQL instance's connection name, typically in
# the format "project-name:region:instance-name".
icn: <data-source>
# Use IAM auth to connect to the Cloud SQL database.
iam: <bool>
# Use private IP to connect to the Cloud SQL database.
private_ip: <bool>
# Configuration settings for the database connection pool.
pool:
# The maximum size of the connection pool. Defaults to 20.
max_connections: <int>
# The minimum size of the connection pool. Defaults to 0.
min_connections: <int>
# If specified, the broker will use Google Cloud Spanner as the metadata storage
# of the cluster.
spanner:
# Required: The Spanner project ID.
project_id: <string>
# Required: The Spanner instance ID.
instance_id: <string>
# Required: The Spanner database name.
database_name: <string>
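# For example, two sketches of durable metadata storage; exactly one may be
# used (addresses and variable names are illustrative):
# ```
# metadata:
#   etcd:
#     addresses:
#       - etcd-0.etcd:2379
#       - etcd-1.etcd:2379
#       - etcd-2.etcd:2379
#
# metadata:
#   postgres:
#     env_var: BUFSTREAM_POSTGRES_DSN
# ```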
# Configuration for data storage.
#
# By default, data will be stored in-memory, unless a durable metadata storage provider is
# selected, in which case data will be stored on the local filesystem, at
# `$HOME/.local/share/bufstream` on Linux and `%LocalAppData%\bufstream` on Windows.
data:
# If specified, use an Amazon S3-compatible storage provider with the given storage bucket
# and prefix, e.g. `s3://my-bucket/my-prefix/`
s3: <string>
# Additional options can be specified. If so, the storage bucket and prefix will instead be
# specified under the `uri` key.
s3:
# URI of the storage bucket and prefix, e.g. `s3://my-bucket/my-prefix/`
uri: <string>
# The region in which the bucket exists. It is necessary to specify this if
# the broker is in a different region than the bucket, or if the broker is
# running outside of AWS.
region: <string>
# The endpoint to connect to. If specified, overrides the S3-compatible
# endpoint that Bufstream will connect to. This is necessary if using an
# S3-compatible provider other than AWS, or when connecting to special S3
# endpoints such as those provided for GovCloud or FIPS compliance.
endpoint: <string>
# Access key ID to use. If set, `secret_access_key` must also be set.
access_key_id: <data-source>
# Secret access key to use for authentication. If set, `access_key_id`
# must also be set.
# Use path-style requests instead of virtual-hosted-style requests.
# Most S3 providers use virtual hosting, but some configurations
# require path-style requests, such as MinIO.
force_path_style: <bool>
# If specified, use Google Cloud Storage with the provided storage bucket URI,
# e.g. `gs://my-bucket/my-prefix/`
gcs: <string>
# If specified, use Azure Blob Storage with the provided container URI,
# e.g. `https://myaccount.blob.core.windows.net/mycontainer/myprefix`
azure: <string>
# Additional options can be specified. If so, the container URI will instead be
# specified under the `uri` key.
azure:
# Container URI to connect to, e.g. `https://myaccount.blob.core.windows.net/mycontainer/myprefix`
uri: <string>
# Access key ID to use. If set, `secret_access_key` must also be set.
access_key_id: <data-source>
# Secret access key to use for authentication. If set, `access_key_id`
# must also be set.
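# For example, a sketch of S3-compatible storage pointed at a MinIO
# deployment (the endpoint, bucket, and credential variables are illustrative):
# ```
# data:
#   s3:
#     uri: s3://bufstream/data/
#     region: us-east-1
#     endpoint: http://minio:9000
#     force_path_style: true
#     access_key_id:
#       env_var: AWS_ACCESS_KEY_ID
#     secret_access_key:
#       env_var: AWS_SECRET_ACCESS_KEY
# ```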
# The schema registry used for data enforcement.
schema_registry:
# Configuration for a Confluent Schema Registry (CSR) instance.
confluent:
# Root URL (including protocol and any required path prefix) of the CSR API.
url: <string>
# Name of the CSR instance within the BSR. This name is used to disambiguate
# subjects of the same name within the same schema file. Used exclusively
# for schema coercion.
instance_name: <string>
# TLS configuration. If unset and the url field specifies https, a default
# configuration is used.
tls:
# Controls whether a client verifies the server's certificate chain and host
# name. If true, the dialer accepts any certificate presented by the server
# and any host name in that certificate. In this mode, TLS is susceptible to
# machine-in-the-middle attacks and should only be used for testing.
insecure_skip_verify: <bool>
# The PEM-encoded certificate authorities used by the client to validate
# the server certificates. If empty, the host's root CA set is used.
certificate_authorities:
- <data-source>
# Authentication to use for the registry. At most one method may be specified.
authentication:
# Authenticate against the CSR API using basic auth credentials.
basic_auth:
# The source of the basic auth username.
username: <data-source>
# The source of the basic auth password.
password: <data-source>
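# For example, a sketch of a Confluent Schema Registry secured with basic
# auth (the URL and variable names are illustrative):
# ```
# schema_registry:
#   confluent:
#     url: https://csr.example.com
#     authentication:
#       basic_auth:
#         username:
#           env_var: CSR_USERNAME
#         password:
#           env_var: CSR_PASSWORD
# ```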
# Configuration for Iceberg integration, for exposing Kafka topics as tables
# in Apache Iceberg v2 format.
#
# By default, Iceberg integration is disabled.
iceberg:
# Name of this catalog, used to disambiguate multiple catalogs used across
# topics and tables.
- name: <string>
# REST catalog. Valid table names must be in the form "namespace.table". The
# namespace may contain multiple components such as "ns1.ns2.ns3.table". The
# underlying catalog implementation that provides the REST API may impose
# further constraints on table and namespace naming.
#
# Also see
# https://github.com/apache/iceberg/blob/main/open-api/rest-catalog-open-api.yaml
rest:
# Root URL (including protocol and any required path prefix) of the catalog server.
url: <string>
# Optional URI prefix. This is separate from any URI prefix present in `url`. This
# prefix appears after the "/v1/" API path component but before the remainder of
# the URI path.
uri_prefix: <string>
# Optional warehouse location. Some REST catalogs require this property in the
# client's initial configuration requests.
warehouse: <string>
# TLS configuration. If unset and the url field specifies https, a default
# configuration is used.
tls:
# Controls whether a client verifies the server's certificate chain and host
# name. If true, the dialer accepts any certificate presented by the server
# and any host name in that certificate. In this mode, TLS is susceptible to
# machine-in-the-middle attacks and should only be used for testing.
insecure_skip_verify: <bool>
# The PEM-encoded certificate authorities used by the client to validate
# the server certificates. If empty, the host's root CA set is used.
certificate_authorities:
- <data-source>
# Authentication to use for the catalog. At most one method may be specified.
authentication:
# Authenticate against the Iceberg catalog using basic auth credentials.
basic_auth:
# The source of the basic auth username.
username: <data-source>
# The source of the basic auth password.
password: <data-source>
# Authenticate against the Iceberg catalog with the given static bearer token
# (which could be a long-lived OAuth2 token).
bearer_token: <data-source>
# Authenticate against the Iceberg catalog with the given OAuth2 configuration.
oauth2:
# The URL of the token endpoint, used to provision access tokens for use with
# requests to the catalog. If not specified, this defaults to the catalog's
# base URL with "v1/oauth/tokens" appended to the URI path, which matches the
# URI of the endpoint as specified in the Iceberg Catalog's OpenAPI spec.
token_endpoint_url: <string>
# The scope to request when provisioning an access token. If not specified,
# defaults to "catalog".
scope: <string>
# The credentials used to authenticate to the token endpoint.
client_id: <data-source>
# The credentials used to authenticate to the token endpoint.
client_secret: <data-source>
# Optional alternate TLS configuration for the token endpoint. If not
# specified, accessing the token endpoint will use the same TLS configuration
# as used for accessing other REST catalog endpoints.
# (See RESTCatalogConfig.tls).
tls:
# Controls whether a client verifies the server's certificate chain and host
# name. If true, the dialer accepts any certificate presented by the server
# and any host name in that certificate. In this mode, TLS is susceptible to
# machine-in-the-middle attacks and should only be used for testing.
insecure_skip_verify: <bool>
# The PEM-encoded certificate authorities used by the client to validate
# the server certificates. If empty, the host's root CA set is used.
certificate_authorities:
- <data-source>
# Google Cloud BigQuery Metastore. Valid table names must be in the form
# "dataset.table".
bigquery_metastore:
# The GCP project of the BigQuery Metastore. If empty, this is assumed to be the
# current project in which the Bufstream workload is running.
project: <string>
# The location for any BigQuery datasets that are created. Must be present if
# cloud_resource_connection is present. Otherwise, if absent, datasets cannot be
# auto-created, so any dataset referenced by an Iceberg table name must already
# exist.
location: <string>
# The name of a BigQuery Cloud Resource connection. This is only the simple name
# of the connection, not the full name. Since a BigQuery dataset can only use
# connections in the same project and location, the full connection name (which
# includes its project and location) is not necessary.
#
# If absent, no override connection will be associated with created tables.
cloud_resource_connection: <string>
# AWS Glue Data Catalog. Valid table names must be in the form
# "database.table".
aws_glue_data_catalog:
# The AWS account ID of the AWS Glue catalog.
#
# This is normally not necessary as it defaults to the account ID for the
# IAM user of the workload. But if the workload's credentials are not those
# of an IAM user or if the Glue catalog is defined in a different AWS
# account, then this must be specified.
aws_account_id: <string>
# The AWS region to indicate in the credential scope of the signature.
#
# This field defaults to the region of the broker's host.
region: <string>
# Specifies the AWS access key ID for authentication to the resource.
#
# By default, authentication is performed using the metadata service of the
# broker's host. If set, `secret_access_key` must also be provided.
access_key_id: <data-source>
# Specifies the AWS secret access key for authentication to the resource.
#
# By default, authentication is performed using the metadata service of the
# broker's host. If set, `access_key_id` must also be provided.
secret_access_key: <data-source>
# Specifies the AWS session token when using AWS temporary credentials to
# access the cloud resource. Omit when not using temporary credentials.
#
# Temporary credentials are not recommended for production workloads, but
# can be useful in development and test environments to authenticate local
# processes with remote AWS resources.
#
# This value should only be present when `access_key_id` and
# `secret_access_key` are also set.
session_token: <data-source>
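# For example, a sketch of a single REST catalog using OAuth2 client
# credentials (the name, URLs, and variable names are illustrative):
# ```
# iceberg:
#   - name: rest-catalog
#     rest:
#       url: https://catalog.example.com
#       warehouse: s3://warehouse/
#       authentication:
#         oauth2:
#           client_id:
#             env_var: CATALOG_CLIENT_ID
#           client_secret:
#             env_var: CATALOG_CLIENT_SECRET
# ```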
# Configuration for the admin RPC interface.
#
# By default, the admin interface listens on localhost:9089.
admin:
# The address to listen on for Admin RPCs.
listen_address: <hostport>
# If populated, enables and enforces TLS termination on the admin interface.
tls:
# Certificates to present to the client. The first certificate compatible
# with the client's requirements is selected automatically.
certificates:
# The PEM-encoded leaf certificate, which may contain intermediate certificates
# following the leaf certificate to form a certificate chain.
- chain: <data-source>
# The PEM-encoded (unencrypted) private key of the certificate chain.
private_key: <data-source>
# Enable TLS-based authentication using Mutual TLS (mTLS).
mtls:
# Requires the use of an mTLS client certificate. Defaults to true if any
# mTLS certificate authorities are specified.
require: <bool>
# The PEM-encoded certificate authorities used by the server to validate
# the client certificates. If set, certificates will be validated, even if
# they are not required.
certificate_authorities:
- <data-source>
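# For example, a sketch of the admin interface with TLS and required client
# certificates (the addresses and paths are illustrative):
# ```
# admin:
#   listen_address: 0.0.0.0:9089
#   tls:
#     certificates:
#       - chain:
#           path: tls/admin.crt
#         private_key:
#           path: tls/admin.key
#     mtls:
#       require: true
#       certificate_authorities:
#         - path: tls/client-ca.crt
# ```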