Serverless quants
Hi — how do you specify a particular GGUF quant file from a Hugging Face repo when configuring a vLLM serverless endpoint? The configuration only seems to accept the repo id, not an individual file within it.
# --- install_requirements.sh ----------------------------------------------
# Installs base system tools, Miniconda, and vLLM (optionally the nightly
# wheel) into a dedicated conda env. Intended to be *sourced* by the serve
# script so the activated "vllm" env persists in the caller's shell.
MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh"
VLLM_USE_NIGHTLY=1

apt-get update
# Bootstrap sudo first (base image may lack it); remaining tools go through it.
if [ ! -x /usr/bin/sudo ]; then
  apt-get install -y sudo
fi
for pkg in wget screen nvtop; do
  if [ ! -x "/usr/bin/${pkg}" ]; then
    sudo apt-get install -y "$pkg"
  fi
done

# Install Miniconda unattended (-b) into ~/miniconda3, updating in place (-u).
mkdir -p ~/miniconda3
wget "$MINICONDA_URL" -O ~/miniconda3/miniconda.sh
bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
rm ~/miniconda3/miniconda.sh
~/miniconda3/condabin/conda init bash
source ~/.bashrc
# 'conda init' only edits ~/.bashrc, which non-interactive shells usually
# skip; source conda's hook directly so 'conda activate' works here.
source ~/miniconda3/etc/profile.d/conda.sh

# Create an isolated env for vLLM and its dependencies.
conda create -n vllm python=3.12 -y
conda activate vllm
python -m pip install --upgrade pip
pip install -U "huggingface_hub[cli]"
if [ "$VLLM_USE_NIGHTLY" -eq 1 ]; then
  pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
else
  pip install vllm
fi

# --- serve script configuration -------------------------------------------
# NOTE(review): hardcoded placeholder API key — inject a real secret via the
# environment (never commit credentials) before production use.
VLLM_API_KEY="asdfasdf"
# Model selection: vLLM's config normally takes only a repo id, so we
# download the one GGUF quant file we want and point vLLM at the local path.
MODEL_REPO="bartowski/DeepSeek-R1-Distill-Llama-70B-GGUF"
ORIG_MODEL="deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
MODEL_FILE="DeepSeek-R1-Distill-Llama-70B-Q4_K_M.gguf"
RANDOM_SEED=42

# Run the one-shot setup in the current shell (so the conda env it activates
# stays active here), then remove it.
chmod +x /install_requirements.sh
source /install_requirements.sh
rm /install_requirements.sh
#huggingface-cli login --token $HF_TOKEN

# Fetch only the requested quant file if not already cached locally, and
# fail fast on download errors instead of letting 'vllm serve' crash later.
if [ ! -f "/workspace/models/${MODEL_REPO}/${MODEL_FILE}" ]; then
  mkdir -p "/workspace/models/${MODEL_REPO}"
  huggingface-cli download "${MODEL_REPO}" "${MODEL_FILE}" \
    --local-dir "/workspace/models/${MODEL_REPO}" \
    || { echo "error: failed to download ${MODEL_FILE} from ${MODEL_REPO}" >&2; exit 1; }
fi
# Launch vLLM against the locally downloaded GGUF quant. The tokenizer is
# taken from the original (non-GGUF) model repo since GGUF files do not
# ship a full HF tokenizer. --tensor-parallel-size 2 assumes two GPUs are
# visible on this endpoint.
serve_args=(
  "/workspace/models/${MODEL_REPO}/${MODEL_FILE}"
  --port 80
  --api-key "${VLLM_API_KEY}"
  --enable-reasoning
  --reasoning-parser "deepseek_r1"
  --tokenizer "${ORIG_MODEL}"
  --kv-cache-dtype "auto"
  --max-model-len 16384
  --pipeline-parallel-size 1
  --tensor-parallel-size 2
  --seed "${RANDOM_SEED}"
  --swap-space 4
  --cpu-offload-gb 0
  --gpu-memory-utilization 0.95
  --quantization "gguf"
  --device "cuda"
)
vllm serve "${serve_args[@]}"