-
-
Notifications
You must be signed in to change notification settings - Fork 329
Expand file tree
/
Copy pathtest-moe-offloading.sh
More file actions
51 lines (43 loc) · 1.6 KB
/
test-moe-offloading.sh
File metadata and controls
51 lines (43 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/bin/bash
# GPT-OSS MoE CPU Offloading Test Script
# Tests shimmy with and without --cpu-moe flag to demonstrate VRAM reduction
#
# Usage:
#   ./test-moe-offloading.sh
#
# Optional environment overrides (defaults are the original hard-coded paths):
#   MODEL_PATH  GGUF model file; handed to shimmy via SHIMMY_BASE_GGUF
#   SHIMMY_BIN  shimmy executable to run (e.g. a build without the .exe suffix)
#
# Output files (written to the current directory):
#   test-no-moe.log    combined stdout/stderr of TEST 1
#   test-with-moe.log  combined stdout/stderr of TEST 2
#
# The script is deliberately best-effort: a failing or timed-out run never
# aborts it, so both tests always execute and both logs are always written.
# The exit status of each run is captured and printed in the final summary.
# NOTE: the "Model:" / "GPU:" banner lines are informational text only; they
# are not derived from MODEL_PATH or from the actual hardware.

MODEL_PATH="${MODEL_PATH:-./models/gpt-oss-20b-Q4_K_M.gguf}"
SHIMMY_BIN="${SHIMMY_BIN:-./target/release/shimmy.exe}"

#######################################
# Run shimmy under a 60 second timeout with SHIMMY_BASE_GGUF set to the
# model path, showing combined stdout/stderr and saving it to a log file.
# Globals:   MODEL_PATH (read), SHIMMY_BIN (read)
# Arguments: $1   - log file to (over)write
#            $2.. - arguments passed to shimmy
# Outputs:   shimmy's stdout and stderr, merged, on stdout
# Returns:   exit status of `timeout`, i.e. shimmy's own status, or 124 when
#            the time limit was hit, or 126/127 when shimmy could not be run
#######################################
run_shimmy() {
  local log_file=$1
  shift
  SHIMMY_BASE_GGUF="$MODEL_PATH" timeout 60s "$SHIMMY_BIN" "$@" 2>&1 | tee "$log_file"
  # The pipeline's own status is tee's; report the first stage instead.
  return "${PIPESTATUS[0]}"
}

echo "========================================="
echo "GPT-OSS MoE CPU Offloading Test"
echo "========================================="
echo ""
echo "Model: gpt-oss-20b-Q4_K_M (11.6 GB)"
echo "GPU: RTX 3060 (4GB VRAM)"
echo ""

# Preflight: warn (on stderr) but keep going, so the logs still record
# whatever error the run itself produces.
if ! command -v "$SHIMMY_BIN" >/dev/null 2>&1; then
  echo "WARNING: shimmy binary not found or not executable: $SHIMMY_BIN" >&2
fi
if [[ ! -f "$MODEL_PATH" ]]; then
  echo "WARNING: model file not found: $MODEL_PATH" >&2
fi

# Test 1: Try WITHOUT MoE offloading (will likely fail/OOM)
echo "----------------------------------------"
echo "TEST 1: WITHOUT MoE offloading"
echo "Expected: VRAM overflow or very slow"
echo "----------------------------------------"
echo ""
echo "Running: shimmy probe (no --cpu-moe flag)"
echo ""
run_shimmy test-no-moe.log probe gpt-oss-20b
no_moe_status=$?
echo ""
echo ""

# Test 2: WITH MoE CPU offloading
echo "----------------------------------------"
echo "TEST 2: WITH --cpu-moe flag"
echo "Expected: Experts offloaded, fits in VRAM"
echo "----------------------------------------"
echo ""
echo "Running: shimmy serve --cpu-moe"
echo ""
# NOTE(review): `serve` presumably keeps running until killed, so a status of
# 124 here likely means the server was still up after 60s - confirm.
run_shimmy test-with-moe.log serve --bind 127.0.0.1:11435 --cpu-moe
with_moe_status=$?
echo ""
echo ""

echo "========================================="
echo "Test Complete!"
echo "========================================="
echo ""
echo "Exit codes (124 = stopped by the 60s timeout):"
echo " - TEST 1 (no --cpu-moe): $no_moe_status"
echo " - TEST 2 (--cpu-moe): $with_moe_status"
echo ""
echo "Check logs:"
echo " - test-no-moe.log: Baseline (should show VRAM issues)"
echo " - test-with-moe.log: With MoE offloading (should succeed)"
echo ""
echo "Look for 'MoE:' log lines in test-with-moe.log"