diff --git a/docs/modules/airflow/examples/example-airflow-kubernetes-executor-s3-logging.yaml b/docs/modules/airflow/examples/example-airflow-kubernetes-executor-s3-logging.yaml
new file mode 100644
index 00000000..0c88c900
--- /dev/null
+++ b/docs/modules/airflow/examples/example-airflow-kubernetes-executor-s3-logging.yaml
@@ -0,0 +1,24 @@
+apiVersion: airflow.stackable.tech/v1alpha1
+kind: AirflowCluster
+metadata:
+  name: airflow
+spec:
+  image:
+    productVersion: 2.9.3
+  clusterConfig: {}
+  webservers:
+    envOverrides: &s3-logging-env-overrides
+      AIRFLOW__LOGGING__REMOTE_LOGGING: "True"
+      AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER: s3://airflow-task-logs/
+      # The name / connection ID created in the Airflow Web UI
+      AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID: minio
+    roleGroups:
+      default:
+        replicas: 1
+  schedulers:
+    envOverrides: *s3-logging-env-overrides
+    roleGroups:
+      default:
+        replicas: 1
+  kubernetesExecutors:
+    envOverrides: *s3-logging-env-overrides
diff --git a/docs/modules/airflow/images/airflow_dag_s3_logs.png b/docs/modules/airflow/images/airflow_dag_s3_logs.png
new file mode 100644
index 00000000..52523347
Binary files /dev/null and b/docs/modules/airflow/images/airflow_dag_s3_logs.png differ
diff --git a/docs/modules/airflow/images/airflow_edit_s3_connection.png b/docs/modules/airflow/images/airflow_edit_s3_connection.png
new file mode 100644
index 00000000..52637212
Binary files /dev/null and b/docs/modules/airflow/images/airflow_edit_s3_connection.png differ
diff --git a/docs/modules/airflow/pages/usage-guide/using-kubernetes-executors.adoc b/docs/modules/airflow/pages/usage-guide/using-kubernetes-executors.adoc
index 1ba069a5..c5771752 100644
--- a/docs/modules/airflow/pages/usage-guide/using-kubernetes-executors.adoc
+++ b/docs/modules/airflow/pages/usage-guide/using-kubernetes-executors.adoc
@@ -3,6 +3,8 @@
 
 Instead of using the Celery workers you can let Airflow run the tasks using Kubernetes executors, where Pods are created dynamically as needed without jobs being routed through a Redis queue to the workers.
 
+== Kubernetes Executor configuration
+
 To achieve this, swap `spec.celeryExecutors` with `spec.kubernetesExecutors`.
 E.g. you would change the following example
 
@@ -28,3 +30,60 @@ spec:
   resources:
     # ...
 ----
+
+== Logging
+
+Kubernetes Executors and their respective Pods only live as long as the task they are executing.
+Afterwards the Pod is terminated immediately, and its console output and log files are gone.
+
+To persist task logs, Airflow can be configured to store its https://airflow.apache.org/docs/apache-airflow-providers-cncf-kubernetes/stable/kubernetes_executor.html#managing-dags-and-logs[executor logs on disk (PV)] or, as described in the following section, on S3.
+
+=== Airflow Web UI
+
+In the Airflow Web UI, click on `Admin` -> `Connections` -> `Add a new record` (the plus button).
+Then enter your MinIO host and credentials as shown below.
+
+image::airflow_edit_s3_connection.png[Airflow connection menu]
+
+The name or connection ID is `minio`, the type is `Amazon Web Services`, and the `AWS Access Key ID` and `AWS Secret Access Key` are filled with the S3 credentials.
+The `Extra` field contains the endpoint URL, for example:
+
+[source,json]
+----
+{
+  "endpoint_url": "http://minio.default.svc.cluster.local:9000"
+}
+----
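+
+Alternatively, the connection does not have to be created by hand in the Web UI: Airflow can also pick up connections from environment variables named `AIRFLOW_CONN_<conn_id>`.
+As a sketch, the same `minio` connection (with placeholder credentials and the endpoint above) could be added to the `envOverrides` shown in the next section:
+
+[source,yaml]
+----
+# Sketch only: declares the `minio` connection in Airflow's JSON connection format.
+# Replace the placeholder access key and secret key with the real S3 credentials.
+AIRFLOW_CONN_MINIO: '{"conn_type": "aws", "login": "<access-key>", "password": "<secret-key>", "extra": {"endpoint_url": "http://minio.default.svc.cluster.local:9000"}}'
+----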
+
+=== Executor configuration
+
+To configure S3 logging, add the following environment variables to the Airflow cluster definition:
+
+[source,yaml]
+----
+include::example$example-airflow-kubernetes-executor-s3-logging.yaml[]
+----
+
+Now you should be able to fetch and inspect the logs from S3 in the Airflow Web UI for each DAG run.
+
+image::airflow_dag_s3_logs.png[Airflow DAG S3 logs]
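+
+If no task logs show up, the AWS provider may additionally expect a region to be configured; depending on the setup this can be required even for S3-compatible storage such as MinIO.
+As a sketch, an arbitrary region can be supplied through the standard `AWS_DEFAULT_REGION` environment variable in the same `envOverrides` as above:
+
+[source,yaml]
+----
+# Sketch only: some setups require a region for the S3 client;
+# the value is largely arbitrary for S3-compatible storage such as MinIO.
+AWS_DEFAULT_REGION: "us-east-1"
+----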