vLLM call for multiple GPUs, just FYI.

#16 opened by silvacarl

from vllm import LLM
from vllm.sampling_params import SamplingParams

model_name = "mistralai/Mistral-Small-Instruct-2409"

sampling_params = SamplingParams(max_tokens=8192)

Note that running Mistral-Small on a single GPU requires at least 44 GB of GPU RAM.

If you want to divide the GPU requirement over multiple devices, add e.g. tensor_parallel_size=2, as in the call below.

llm = LLM(model=model_name, tokenizer_mode="mistral", config_format="mistral", load_format="mistral", tensor_parallel_size=2)
prompt = "How often does the letter r occur in Mistral?"

messages = [
    {
        "role": "user",
        "content": prompt,
    },
]

outputs = llm.chat(messages, sampling_params=sampling_params)

print(outputs[0].outputs[0].text)
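Since tensor_parallel_size shards the weights across exactly that many devices, a quick sanity check before constructing the LLM avoids a confusing failure when fewer GPUs are visible than requested. A minimal sketch, assuming torch is importable wherever vLLM is installed:

import torch

# Sketch: confirm enough GPUs are visible before requesting tensor_parallel_size=2.
tp_size = 2
visible = torch.cuda.device_count()
assert visible >= tp_size, f"tensor_parallel_size={tp_size} but only {visible} GPU(s) visible"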

Also, to serve the model instead of running it offline: vllm serve mistralai/Mistral-Small-Instruct-2409 --tokenizer_mode mistral --config_format mistral --load_format mistral --tensor-parallel-size 2
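Once the server is up, it exposes vLLM's OpenAI-compatible API (by default on http://localhost:8000), so completions can be requested over HTTP. A minimal client sketch, assuming the default host/port and no API key:

import requests

# Sketch: query the OpenAI-compatible endpoint started by `vllm serve` above.
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "mistralai/Mistral-Small-Instruct-2409",
        "messages": [
            {"role": "user", "content": "How often does the letter r occur in Mistral?"}
        ],
        "max_tokens": 256,
    },
)
print(resp.json()["choices"][0]["message"]["content"])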
