AI Reference Architecture
A reference stack built from Triton Inference Server running TensorRT-LLM engines, a Weaviate vector database, a secure proxy, and an optional GUI.
High-Level Services
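The excerpt below defines four services, plus the proxy and GUI that front the stack:

- trtllm-build: one-shot job that compiles the Mixtral-8x22B-Instruct checkpoint into TensorRT-LLM engines.
- triton: Triton Inference Server serving the built engines over HTTP (8000), gRPC (8001), and metrics (8002).
- weaviate: vector database for retrieval; the built-in vectorizer is disabled, so clients supply embeddings.
- finisher: custom service that ties retrieval (Weaviate) to generation (Triton).
- The secure proxy and optional GUI sit in front of the stack and are not part of the compose excerpt.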
Sample docker-compose (excerpt)
version: "3.9"
services:
  trtllm-build:
    image: "nvcr.io/nvidia/trt-llm:XX.YY-py3"
    runtime: nvidia
    volumes:
      - /opt/models:/opt/models
      - /opt/hf:/opt/hf
      - /opt/finisher/trtllm-build:/workspace
    # trtllm-build consumes an already-converted TensorRT-LLM checkpoint;
    # dtype and tensor parallelism (e.g. bfloat16, tp_size 8) are baked in
    # earlier by convert_checkpoint.py, not set here. Context FMHA and
    # in-flight batching are handled by the runtime/backend in recent releases.
    command: >-
      bash -lc "trtllm-build
      --checkpoint_dir /opt/models/mixtral-8x22b-instruct
      --output_dir /opt/models/trt-engines/mixtral-8x22b-instruct
      --gemm_plugin bfloat16
      --workers 8
      && echo DONE"
  triton:
    # The -trtllm-python-py3 image variant bundles the TensorRT-LLM backend.
    image: "nvcr.io/nvidia/tritonserver:XX.YY-trtllm-python-py3"
    runtime: nvidia
    volumes:
      - /opt/models:/opt/models
    command: tritonserver --model-repository=/opt/models/triton-repo  # illustrative path
    ports: ["8000:8000", "8001:8001", "8002:8002"]  # HTTP, gRPC, metrics
  weaviate:
    image: "semitechnologies/weaviate:1.26.7"
    environment:
      DEFAULT_VECTORIZER_MODULE: "none"
      PERSISTENCE_DATA_PATH: "/var/lib/weaviate"
    volumes:
      - /opt/weaviate:/var/lib/weaviate  # persist vectors; host path illustrative
    ports: ["8080:8080"]
  finisher:
    build: { context: /opt/finisher, dockerfile: Dockerfile }
    depends_on: [weaviate, triton]
    environment:
      WEAVIATE_ENDPOINT: "http://weaviate:8080"
      TRITON_GRPC_URL: "triton:8001"
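To bring the stack up, the engine build runs to completion once before the long-lived services start. A typical sequence, assuming the compose file above, followed by a smoke test against Triton's generate endpoint (the "ensemble" model and its field names assume the stock tensorrtllm_backend repository layout):

docker compose up trtllm-build                  # one-shot build; wait for DONE
docker compose up -d triton weaviate finisher   # long-lived services

curl -s localhost:8000/v2/models/ensemble/generate \
  -d '{"text_input": "Hello", "max_tokens": 32}'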
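The finisher's core retrieve-then-generate loop can be sketched in a few lines. This is illustrative, not the actual service: it assumes a Weaviate class named Doc with a text property, the v3-style weaviate-client (pip install "weaviate-client<4", which talks plain REST so only port 8080 is needed), and the stock TensorRT-LLM ensemble model served by Triton.

# Hedged sketch of the finisher's retrieve-then-generate loop.
# Class name "Doc", property "text", and model name "ensemble" are
# assumptions; adjust to the deployed schema and Triton repository.
import os

import numpy as np
import tritonclient.grpc as grpcclient
import weaviate  # v3 client: pip install "weaviate-client<4"

WEAVIATE_ENDPOINT = os.environ.get("WEAVIATE_ENDPOINT", "http://weaviate:8080")
TRITON_GRPC_URL = os.environ.get("TRITON_GRPC_URL", "triton:8001")


def retrieve(query_vector, k=3):
    """Nearest-neighbor lookup; the vectorizer is 'none', so the caller
    supplies the query embedding."""
    client = weaviate.Client(WEAVIATE_ENDPOINT)
    res = (
        client.query.get("Doc", ["text"])
        .with_near_vector({"vector": query_vector})
        .with_limit(k)
        .do()
    )
    return [d["text"] for d in res["data"]["Get"]["Doc"]]


def generate(prompt, max_tokens=256):
    """One round-trip to the TensorRT-LLM ensemble over Triton gRPC."""
    client = grpcclient.InferenceServerClient(url=TRITON_GRPC_URL)
    text = np.array([[prompt]], dtype=object)
    tokens = np.array([[max_tokens]], dtype=np.int32)
    inputs = [
        grpcclient.InferInput("text_input", list(text.shape), "BYTES"),
        grpcclient.InferInput("max_tokens", list(tokens.shape), "INT32"),
    ]
    inputs[0].set_data_from_numpy(text)
    inputs[1].set_data_from_numpy(tokens)
    result = client.infer("ensemble", inputs)
    return result.as_numpy("text_output").flatten()[0].decode()


if __name__ == "__main__":
    ctx = retrieve([0.1] * 1024)  # embedding dim depends on your encoder
    print(generate("Context:\n" + "\n".join(ctx) + "\n\nQuestion: ..."))

The embedding step is deliberately out of scope here; with DEFAULT_VECTORIZER_MODULE set to "none", whatever encoder the finisher uses must match the dimensionality of the vectors already stored in Weaviate.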