Docker & Kubernetes

(airflow) An in-depth look at the docker-compose.yaml file

데이터왕 2024. 1. 8. 16:32

# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL.
# This configuration supports basic settings via environment variables or an .env file.
# The following variables are supported:
#
# AIRFLOW_IMAGE_NAME           - Docker image name used to run Airflow.
#                                Default: apache/airflow:2.5.1
# AIRFLOW_UID                  - User ID in the Airflow containers.
#                                Default: 50000
# AIRFLOW_PROJ_DIR             - Base path under which all files will be volume-mounted.
#                                Default: .
# These settings are mostly useful for running Airflow standalone in a test/try-out mode.
#
# _AIRFLOW_WWW_USER_USERNAME   - Username for the administrator account (if requested).
#                                Default: airflow
# _AIRFLOW_WWW_USER_PASSWORD   - Password for the administrator account (if requested).
#                                Default: airflow
# _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to install when starting all containers.
#                                Default: ''
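#
# For illustration, a minimal .env file placed next to this docker-compose.yaml
# could look like this (the values below are just examples):
#
#   AIRFLOW_IMAGE_NAME=apache/airflow:2.5.1
#   AIRFLOW_UID=50000
#   _AIRFLOW_WWW_USER_USERNAME=airflow
#   _AIRFLOW_WWW_USER_PASSWORD=airflow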

---
version: '3'
x-airflow-common:
  &airflow-common

# To add custom dependencies or upgrade provider packages via a requirements.txt
# file, you can use an extended image built from a Dockerfile:
# comment out the 'image: ${AIR...' line below, create a Dockerfile in the same
# directory as this docker-compose.yaml, uncomment the "build" line below,
# and then run `docker-compose build` to build the image.
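#
# For illustration, a minimal Dockerfile for such an extended image might look
# like this (a sketch, assuming a requirements.txt next to this file):
#
#   FROM apache/airflow:2.5.1
#   COPY requirements.txt /requirements.txt
#   RUN pip install --no-cache-dir -r /requirements.txt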

  # Image selection: if AIRFLOW_IMAGE_NAME is set, that value is used;
  # otherwise the default apache/airflow:2.5.1 is used.
  image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.5.1}
    # build: .
  environment:
    &airflow-common-env
    # Sets Airflow's executor. CeleryExecutor uses Celery to distribute tasks
    # and run them in parallel across multiple workers.
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor

    # SQLAlchemy connection string Airflow uses to connect to its metadata database.
    # Here it connects to PostgreSQL via the psycopg2 Python driver.
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
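    # The URL breaks down as dialect+driver://user:password@host/database:
    # user "airflow", password "airflow", host "postgres" (the service name
    # below, resolved over the Compose network), database "airflow".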
                
    # Kept for backward compatibility with Airflow versions before 2.3: the
    # same SQLAlchemy connection string under the old core-section key.
    AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    
    # Backend where Celery stores task results when using CeleryExecutor;
    # used to track task execution and results.
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
    
    # Celery broker URL: Redis at host "redis", port 6379, broker database 0.
    AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
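    # The format is redis://:password@host:port/db_number; the empty string
    # before the "@" means this Redis instance has no password set.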
    
    # Fernet is a symmetric-key encryption scheme; Airflow uses this key to
    # encrypt sensitive values such as connection passwords in the metadata
    # database. It is left empty here, so those values are stored unencrypted.
    AIRFLOW__CORE__FERNET_KEY: ''
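    # To enable encryption, a key can be generated like this (example; the
    # cryptography package is a core Airflow dependency):
    #   python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())"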
    
    # Whether newly created DAGs are paused by default.
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
        
    # Whether to load the example DAGs that ship with Airflow.
    AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
        
    # Authentication backends for the Airflow REST API; here basic_auth and session.
    # basic_auth: HTTP authentication with username/password; session: session-based authentication.
    AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
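    # With basic_auth enabled, the REST API can be exercised like this
    # (example, using the default admin credentials):
    #   curl -u airflow:airflow http://localhost:8080/api/v1/dags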
    
    # Extra Python packages to install when the containers start.
    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
        
  # Map host folders into the containers as volumes.
  volumes:
    - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
    - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
    - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
        
  # Run as AIRFLOW_UID if set, otherwise the default 50000, with group 0.
  user: "${AIRFLOW_UID:-50000}:0"
  # Start the containers only once redis and postgres report healthy.
  depends_on:
    &airflow-common-depends-on
    redis:
      condition: service_healthy
    postgres:
      condition: service_healthy

services:
  postgres:
    image: postgres:13
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    healthcheck:
      # Checks whether the PostgreSQL server is ready to accept connections;
      # -U airflow means connecting as the airflow user.
      test: ["CMD", "pg_isready", "-U", "airflow"]
      # Run the health check every 5 seconds.
      interval: 5s
      # Retry up to 5 times on failure.
      retries: 5
    # Restart the container automatically if it stops.
    restart: always

  # The redis service uses the Redis image, exposes port 6379, periodically
  # checks its health with `redis-cli ping`, and restarts automatically on failure.
  redis:
    image: redis:latest
    expose:
      - 6379
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 30s
      retries: 50
    restart: always

  # The Airflow webserver: exposed on port 8080, health-checked periodically,
  # and restarted automatically on failure. The airflow-webserver service only
  # starts after the airflow-init service has completed successfully.
  airflow-webserver:
    # Merge in everything defined under the airflow-common anchor
    # (image, environment, volumes, user, depends_on).
    <<: *airflow-common
    command: webserver
    ports:
      - 8080:8080
    healthcheck:
      # --fail makes curl exit with an error when the server returns an error
      # response; the URL is Airflow's health endpoint.
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully
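  # The same endpoint can also be checked by hand from the host (example):
  #   curl http://localhost:8080/health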

  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      # Airflow command that checks whether the scheduler job is healthy.
      # $$ escapes the dollar sign for Docker Compose, so $HOSTNAME is
      # expanded inside the container rather than at compose time.
      test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-worker:
    <<: *airflow-common
    command: celery worker
    healthcheck:
      test:
        # Run the command through a shell.
        - "CMD-SHELL"
        # Celery command that pings this worker to confirm it is running and
        # responsive. Airflow uses Celery, a distributed task queue, to run
        # tasks asynchronously; $${HOSTNAME} is expanded inside the container.
        - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
      interval: 10s
      timeout: 10s
      retries: 5
    environment:
      <<: *airflow-common-env
      # Setting DUMB_INIT_SETSID to 0 tells dumb-init not to create a new
      # session, so signals are forwarded only to the direct child process.
      # This is needed for Celery workers to handle warm shutdown properly.
      DUMB_INIT_SETSID: "0"
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-triggerer:
    <<: *airflow-common
    command: triggerer
    healthcheck:
      # CMD-SHELL test that checks whether the triggerer job is running correctly.
      test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully

  airflow-init:
    <<: *airflow-common
    # Run a Bash shell when the container starts.
    entrypoint: /bin/bash
    # yamllint disable rule:line-length
    command:
      - -c
      - |
        function ver() {
          printf "%04d%04d%04d%04d" $${1//./ }
        }
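        # e.g. ver 2.5.1 -> "0002000500010000": each dot-separated component
        # is zero-padded to four digits so versions compare as plain integers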
        # Determine the currently installed Airflow version
        airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version)
        airflow_version_comparable=$$(ver $${airflow_version})
        
        # Minimum supported Airflow version
        min_airflow_version=2.2.0
        min_airflow_version_comparable=$$(ver $${min_airflow_version})
        
        # Exit with an error if the current version is below the minimum
        if (( airflow_version_comparable < min_airflow_version_comparable )); then
          echo
          echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m"
          echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!"
          echo
          exit 1
        fi
        
        # Warn if AIRFLOW_UID is not set
        if [[ -z "${AIRFLOW_UID}" ]]; then
          echo
          echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
          echo "If you are on Linux, you SHOULD follow the instructions below to set "
          echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
          echo "For other operating systems you can get rid of the warning with manually created .env file:"
          echo "    See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
          echo
        fi
        
        # Check available system resources (memory, CPU, disk)
        one_meg=1048576
        mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
        cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
        disk_available=$$(df / | tail -1 | awk '{print $$4}')
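        # Units: _PHYS_PAGES * PAGE_SIZE is total memory in bytes, so
        # mem_available is in MB; df reports 1K blocks, so the
        # one_meg * 10 blocks threshold below corresponds to roughly 10 GB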
        warning_resources="false"
        
        # Warn if available memory is below the minimum
        if (( mem_available < 4000 )) ; then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
          echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
          echo
          warning_resources="true"
        fi
        # Warn if available CPUs are below the recommended minimum
        if (( cpus_available < 2 )); then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
          echo "At least 2 CPUs recommended. You have $${cpus_available}"
          echo
          warning_resources="true"
        fi
        # Warn if available disk space is below the recommended minimum
        if (( disk_available < one_meg * 10 )); then
          echo
          echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
          echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
          echo
          warning_resources="true"
        fi
        
        # Print a summary warning if any resource was insufficient
        if [[ $${warning_resources} == "true" ]]; then
          echo
          echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
          echo "Please follow the instructions to increase amount of resources available:"
          echo "   https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin"
          echo
        fi
        
        # Create the Airflow folders and set their ownership
        mkdir -p /sources/logs /sources/dags /sources/plugins
        chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
        
        # Hand off to the image entrypoint with "airflow version" as the
        # command; the entrypoint also performs the DB upgrade and admin
        # user creation driven by the environment variables below.
        exec /entrypoint airflow version
    # yamllint enable rule:line-length
    environment:
      <<: *airflow-common-env
      # Upgrade the DB and create the admin account with the given username/password.
      _AIRFLOW_DB_UPGRADE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'true'
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
      _PIP_ADDITIONAL_REQUIREMENTS: ''
    user: "0:0"
    volumes:
      - ${AIRFLOW_PROJ_DIR:-.}:/sources

 

  # airflow-cli exposes Apache Airflow's command line interface (CLI), which
  # lets you drive Airflow's features from the command line.

  airflow-cli:
    <<: *airflow-common
    profiles:
      - debug
    environment:
      <<: *airflow-common-env
      CONNECTION_CHECK_MAX_COUNT: "0"
    # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252
    # Run the given string through a Bash shell, launching the Airflow CLI.
    command:
      - bash
      - -c
      - airflow
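    # Example usage (the service's debug profile is activated automatically
    # when the service is targeted explicitly on the command line):
    #   docker-compose run --rm airflow-cli airflow dags list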

  # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up
  # or by explicitly targeted on the command line e.g. docker-compose up flower.
  # See: https://docs.docker.com/compose/profiles/
  # Flower is a tool for visually monitoring and managing the Celery task queue.
  flower:
    <<: *airflow-common
    command: celery flower
    profiles:
      - flower
    ports:
      - 5555:5555
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
      interval: 10s
      timeout: 10s
      retries: 5
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully
# Named volume used by the postgres service for its data directory.
volumes:
  postgres-db-volume:
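
# Typical startup sequence from the official docs: run the init service first,
# then bring up the whole cluster:
#   docker-compose up airflow-init
#   docker-compose up -d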