AI Reference Architecture
A reference stack built from Triton Inference Server running TensorRT-LLM engines, a Weaviate vector database, a secure proxy, and an optional GUI.
High-Level Services
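The excerpt below defines four services, plus the proxy and GUI that front the stack:

- trtllm-build: one-shot job that compiles the Mixtral-8x22B-Instruct checkpoint into TensorRT-LLM engines.
- triton: Triton Inference Server serving the built engines over HTTP (8000), gRPC (8001), and metrics (8002).
- weaviate: vector database for retrieval; the built-in vectorizer is disabled, so clients supply embeddings.
- finisher: custom service that ties retrieval (Weaviate) to generation (Triton).
- The secure proxy and optional GUI sit in front of the stack and are not part of the compose excerpt.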
Sample docker-compose (excerpt)
version: "3.9"
services:
  trtllm-build:
    image: "nvcr.io/nvidia/trt-llm:XX.YY-py3"
    runtime: nvidia
    volumes:
      - /opt/models:/opt/models
      - /opt/hf:/opt/hf
      - /opt/finisher/trtllm-build:/workspace
    # trtllm-build consumes an already-converted TensorRT-LLM checkpoint;
    # dtype and tensor parallelism (e.g. bfloat16, tp_size 8) are baked in
    # earlier by convert_checkpoint.py, not set here. Context FMHA and
    # in-flight batching are handled by the runtime/backend in recent releases.
    command: >-
      bash -lc "trtllm-build
      --checkpoint_dir /opt/models/mixtral-8x22b-instruct
      --output_dir /opt/models/trt-engines/mixtral-8x22b-instruct
      --gemm_plugin bfloat16
      --workers 8
      && echo DONE"
  triton:
    # The -trtllm-python-py3 image variant bundles the TensorRT-LLM backend.
    image: "nvcr.io/nvidia/tritonserver:XX.YY-trtllm-python-py3"
    runtime: nvidia
    volumes:
      - /opt/models:/opt/models
    command: tritonserver --model-repository=/opt/models/triton-repo  # illustrative path
    ports: ["8000:8000", "8001:8001", "8002:8002"]  # HTTP, gRPC, metrics
  weaviate:
    image: "semitechnologies/weaviate:1.26.7"
    environment:
      DEFAULT_VECTORIZER_MODULE: "none"
      PERSISTENCE_DATA_PATH: "/var/lib/weaviate"
    volumes:
      - /opt/weaviate:/var/lib/weaviate  # persist vectors; host path illustrative
    ports: ["8080:8080"]
  finisher:
    build: { context: /opt/finisher, dockerfile: Dockerfile }
    depends_on: [weaviate, triton]
    environment:
      WEAVIATE_ENDPOINT: "http://weaviate:8080"
      TRITON_GRPC_URL: "triton:8001"
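To bring the stack up, the engine build runs to completion once before the long-lived services start. A typical sequence, assuming the compose file above, followed by a smoke test against Triton's generate endpoint (the "ensemble" model and its field names assume the stock tensorrtllm_backend repository layout):

docker compose up trtllm-build                  # one-shot build; wait for DONE
docker compose up -d triton weaviate finisher   # long-lived services

curl -s localhost:8000/v2/models/ensemble/generate \
  -d '{"text_input": "Hello", "max_tokens": 32}'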
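The finisher's core retrieve-then-generate loop can be sketched in a few lines. This is illustrative, not the actual service: it assumes a Weaviate class named Doc with a text property, the v3-style weaviate-client (pip install "weaviate-client<4", which talks plain REST so only port 8080 is needed), and the stock TensorRT-LLM ensemble model served by Triton.

# Hedged sketch of the finisher's retrieve-then-generate loop.
# Class name "Doc", property "text", and model name "ensemble" are
# assumptions; adjust to the deployed schema and Triton repository.
import os

import numpy as np
import tritonclient.grpc as grpcclient
import weaviate  # v3 client: pip install "weaviate-client<4"

WEAVIATE_ENDPOINT = os.environ.get("WEAVIATE_ENDPOINT", "http://weaviate:8080")
TRITON_GRPC_URL = os.environ.get("TRITON_GRPC_URL", "triton:8001")


def retrieve(query_vector, k=3):
    """Nearest-neighbor lookup; the vectorizer is 'none', so the caller
    supplies the query embedding."""
    client = weaviate.Client(WEAVIATE_ENDPOINT)
    res = (
        client.query.get("Doc", ["text"])
        .with_near_vector({"vector": query_vector})
        .with_limit(k)
        .do()
    )
    return [d["text"] for d in res["data"]["Get"]["Doc"]]


def generate(prompt, max_tokens=256):
    """One round-trip to the TensorRT-LLM ensemble over Triton gRPC."""
    client = grpcclient.InferenceServerClient(url=TRITON_GRPC_URL)
    text = np.array([[prompt]], dtype=object)
    tokens = np.array([[max_tokens]], dtype=np.int32)
    inputs = [
        grpcclient.InferInput("text_input", list(text.shape), "BYTES"),
        grpcclient.InferInput("max_tokens", list(tokens.shape), "INT32"),
    ]
    inputs[0].set_data_from_numpy(text)
    inputs[1].set_data_from_numpy(tokens)
    result = client.infer("ensemble", inputs)
    return result.as_numpy("text_output").flatten()[0].decode()


if __name__ == "__main__":
    ctx = retrieve([0.1] * 1024)  # embedding dim depends on your encoder
    print(generate("Context:\n" + "\n".join(ctx) + "\n\nQuestion: ..."))

The embedding step is deliberately out of scope here; with DEFAULT_VECTORIZER_MODULE set to "none", whatever encoder the finisher uses must match the dimensionality of the vectors already stored in Weaviate.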