Compile llama.cpp on Linux
First, clone the llama.cpp repository:
iman@Debian:~$ git clone https://github.com/ggml-org/llama.cpp
iman@Debian:~$ cd llama.cpp
Then save the following script as build.sh in the llama.cpp directory and make it executable with chmod +x build.sh:
#!/bin/bash
# Build helper for llama.cpp with CUDA support on Debian-based systems.
# Usage: ./build.sh [--no-zip|-nz]
export LANG=en_US.UTF-8

## depends cuda-toolkit cmake curl libcurl4-openssl-dev
# Check dependencies: abort early when a required Debian package is missing.
DEPENDENCIES=(
  'cuda-toolkit'
  'cmake'
  'curl'
  'libcurl4-openssl-dev'
)
for pkg in "${DEPENDENCIES[@]}"; do
  # Read the package status field instead of testing for one specific exit
  # code: a dpkg failure with any non-zero status must count as "missing",
  # and a package that was removed but still has config files must not count
  # as installed. Held packages ("hold ok installed") are accepted.
  # shellcheck disable=SC2016 # ${Status} is a dpkg-query format field, not a shell variable
  pkg_status=$(dpkg-query -W -f='${Status}' "$pkg" 2> /dev/null)
  if [[ "$pkg_status" != *"ok installed"* ]]; then
    echo >&2 "'$pkg' package is required, but not available. Aborting."
    exit 1
  fi
done
# Auto-detect CUDA compute capability

#######################################
# Convert a compute capability such as "8.6" into the form expected by
# CMAKE_CUDA_ARCHITECTURES ("86").
# Arguments: $1 - compute capability as "<major>.<minor>"
# Outputs:   the architecture number on stdout
# Returns:   0 on success, 1 when $1 is not of the form "<major>.<minor>"
#######################################
cc_to_arch() {
  local cap=$1
  [[ "$cap" =~ ^[0-9]+\.[0-9]+$ ]] || return 1
  printf '%s' "${cap/./}"
}

ARCH="86" # fallback
if command -v nvidia-smi &> /dev/null; then
  # The value is deliberately NOT stored in a variable named CC: if CC is
  # exported in the caller's environment, assigning to it here would replace
  # the C compiler that cmake picks up later.
  # Only the first GPU reported by nvidia-smi is considered.
  gpu_cc=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -1 | tr -d '[:space:]')
  if detected_arch=$(cc_to_arch "$gpu_cc"); then
    ARCH=$detected_arch
    echo "🎯 Detected GPU compute capability: $gpu_cc"
    echo "CUDA_ARCHITECTURES: $ARCH"
  else
    # nvidia-smi is installed but failed or printed something unusable
    # (e.g. no driver loaded); never hand an empty/garbage value to cmake.
    echo >&2 "⚠️ nvidia-smi did not report a usable compute capability ('$gpu_cc'), using default CC $ARCH"
  fi
else
  echo "⚠️ nvidia-smi not found, using default CC 86"
fi
## see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md
## Export environment variables
## nvidia nvcc CUDA
#export CUDA_HOME=/usr/local/cuda
#export PATH="$PATH:$CUDA_HOME/bin"
#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
## Set the CUDA compiler environment variables
#export CMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc
#export CUDACXX=/usr/local/cuda/bin/nvcc
# Command-line handling: --no-zip (or -nz) switches NOZIP to true.
# Every other argument is consumed and silently ignored.
NOZIP=false
while (( $# > 0 )); do
  if [[ "$1" == "--no-zip" || "$1" == "-nz" ]]; then
    NOZIP=true
  fi
  shift
done
# get current git commit and zip bin directory
# The archive is taken BEFORE 'git pull', so it holds the binaries that belong
# to the commit that is checked out right now.
if ! COMMIT_ID=$(git rev-parse --short HEAD); then
  echo >&2 "Unable to determine the current git commit; run this script from the llama.cpp checkout. Aborting."
  exit 1
fi
echo "$COMMIT_ID"
#FILE="bin-*-$COMMIT_ID.zip"
ZIP="bin-$(date +%Y%m%d)-$COMMIT_ID.zip"
if [ "$NOZIP" = false ]; then
  if [ -f "$ZIP" ]; then
    echo "File $ZIP exists: skipping zip creation."
  elif [ ! -d bin ]; then
    # First build: there is nothing to back up yet.
    echo "No bin/ directory found: skipping zip creation."
  elif ! command -v zip > /dev/null 2>&1; then
    echo >&2 "zip is not installed: skipping zip creation."
  elif zip -r "$ZIP" bin/; then
    # Only report success when zip actually succeeded.
    echo "✅ Created: File $ZIP"
  else
    echo >&2 "zip failed: $ZIP was not created."
  fi
else
  echo "--no-zip flag set: skipping zip creation."
fi
# Updating is best effort: when the pull fails (e.g. offline), the build
# continues with the commit that is already checked out.
git pull || echo >&2 "git pull failed: continuing with the currently checked-out commit."
# Configure the project in the current directory
# build only for the compute capability held in $ARCH (auto-detected above;
# the fallback value 86 corresponds to RTX 3060/3090 class devices)
# see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md#cuda and https://developer.nvidia.com/cuda/gpus
# https://www.reddit.com/r/LocalLLaMA/comments/1rjpifs/comment/o8eqp9w/
echo "🔧 Configuring cmake..."
# --fresh discards the previous CMake cache so stale options cannot leak in.
if ! cmake -B . --fresh -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="$ARCH" -DGGML_CUDA_FA_ALL_QUANTS=ON; then
  # Do not start a build on top of a failed configuration.
  echo >&2 "cmake configuration failed. Aborting."
  exit 1
fi
# Build the project in the current directory
# Git version (re-read here because 'git pull' may have moved HEAD)
COMMIT_ID=$(git rev-parse --short HEAD)
echo "🔨 Building commit: $COMMIT_ID"
# Last command of the script: its exit status becomes the script's status.
cmake --build . --config Release -j"$(nproc)" --clean-first
iman@Debian:~/llama.cpp$ ./build.sh
iman@Debian:~/llama.cpp$ ls bin
export-graph-ops libllama.so.0.0.8621 llama-gguf-split test-c
libggml-base.so libllama.so.0.0.8655 llama-idle test-chat
libggml-base.so.0 libllama.so.0.0.8667 llama-imatrix test-chat-auto-parser
libggml-base.so.0.9.10 libmtmd.so llama-llava-cli test-chat-peg-parser
libggml-base.so.0.9.11 libmtmd.so.0 llama-lookahead test-chat-template
libggml-base.so.0.9.7 libmtmd.so.0.0.8343 llama-lookup test-gbnf-validator
libggml-base.so.0.9.8 libmtmd.so.0.0.8382 llama-lookup-create test-gguf
libggml-cpu.so libmtmd.so.0.0.8412 llama-lookup-merge test-gguf-model-data
libggml-cpu.so.0 libmtmd.so.0.0.8455 llama-lookup-stats test-grammar-integration
libggml-cpu.so.0.9.10 libmtmd.so.0.0.8461 llama-minicpmv-cli test-grammar-parser
libggml-cpu.so.0.9.11 libmtmd.so.0.0.8508 llama-mtmd-cli test-jinja
libggml-cpu.so.0.9.7 libmtmd.so.0.0.8530 llama-mtmd-debug test-json-partial
libggml-cpu.so.0.9.8 libmtmd.so.0.0.8541 llama-parallel test-json-schema-to-grammar
libggml-cuda.so libmtmd.so.0.0.8563 llama-passkey test-llama-archs
libggml-cuda.so.0 libmtmd.so.0.0.8621 llama-perplexity test-llama-grammar
libggml-cuda.so.0.9.10 libmtmd.so.0.0.8655 llama-q8dot test-log
libggml-cuda.so.0.9.11 libmtmd.so.0.0.8667 llama-quantize test-model-load-cancel
libggml-cuda.so.0.9.7 llama-batched llama-qwen2vl-cli test-mtmd-c-api
libggml-cuda.so.0.9.8 llama-batched-bench llama-results test-opt
libggml.so llama-bench llama-retrieval test-peg-parser
libggml.so.0 llama-cli llama-save-load-state test-quantize-fns
libggml.so.0.9.10 llama-completion llama-server test-quantize-perf
libggml.so.0.9.11 llama-convert-llama2c-to-ggml llama-simple test-quantize-stats
libggml.so.0.9.7 llama-cvector-generator llama-simple-chat test-quant-type-selection
libggml.so.0.9.8 llama-debug llama-speculative test-reasoning-budget
libllama.so llama-debug-template-parser llama-speculative-simple test-regex-partial
libllama.so.0 llama-diffusion-cli llama-template-analysis test-rope
libllama.so.0.0.8343 llama-embedding llama-tokenize test-sampling
libllama.so.0.0.8382 llama-eval-callback llama-tts test-state-restore-fragmented
libllama.so.0.0.8412 llama-export-lora llama-vdot test-thread-safety
libllama.so.0.0.8455 llama-finetune test-alloc test-tokenizer-0
libllama.so.0.0.8461 llama-fit-params test-arg-parser test-tokenizer-1-bpe
libllama.so.0.0.8508 llama-gemma3-cli test-autorelease test-tokenizer-1-spm
libllama.so.0.0.8530 llama-gen-docs test-backend-ops
libllama.so.0.0.8541 llama-gguf test-backend-sampler
libllama.so.0.0.8563 llama-gguf-hash test-barrier
iman@Debian:~/llama.cpp$ bin/llama-server -m /path/to/model.gguf --alias "Model-Alias" --n-gpu-layers 999 --cpu-moe --host 127.0.0.1 --port 5000 --flash-attn on --fit on --sleep-idle-seconds 30
See the llama-server README.md (tools/server/README.md in the repository) for the full list of server options.
Category:
LLM