# vllm/entrypoints/llm.py
class LLM:
    """An LLM for generating texts from given prompts and sampling parameters.

    NOTE(review): this is an annotated excerpt. Spans marked with ``...``
    were elided in the source this was taken from and are left as-is.
    """

    def __init__(
        self,
        model: str,
        tokenizer: Optional[str] = None,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = False,
        tensor_parallel_size: int = 1,
        dtype: str = "auto",
        quantization: Optional[str] = None,
        revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        seed: int = 0,
        gpu_memory_utilization: float = 0.9,
        swap_space: int = 4,
        enforce_eager: bool = False,
        max_context_len_to_capture: int = 8192,
        disable_custom_all_reduce: bool = True,
        **kwargs,
    ) -> None:
        """Create the underlying ``LLMEngine`` and the request-id counter."""
        # Elided in this excerpt. NOTE(review): `engine_args` used below is
        # presumably built here from the constructor arguments -- confirm
        # against the full file.
        ...

        # Initialize the LLMEngine instance from the configured engine
        # arguments.
        self.llm_engine = LLMEngine.from_engine_args(
            engine_args, usage_context=UsageContext.LLM_CLASS)

        # Source of request ids. In the vLLM core engine one prompt is
        # treated as one request, and each request is assigned a globally
        # unique request_id.
        self.request_counter = Counter()

    # Elided in this excerpt (further members of the class).
    ...

    def generate(
        self,
        prompts: Optional[Union[str, List[str]]] = None,
        sampling_params: Optional[SamplingParams] = None,
        prompt_token_ids: Optional[List[List[int]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[RequestOutput]:
        """Generates the completions for the input prompts.

        NOTE: This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your
        prompts into a single list and pass it to this method.

        Args:
            prompts: Either a single ``str`` or a ``list[str]``.
            sampling_params: Sampling hyper-parameters such as temperature,
                top_k, etc. If None, a default ``SamplingParams()`` is used.
            prompt_token_ids: Token ids corresponding to the prompts. If not
                provided, vLLM calls the tokenizer to convert the prompts.
            use_tqdm: Whether to show a progress bar.
            lora_request: To request a specific LoRA adapter, wrap its path
                and related information in this request. The original note
                adds that vLLM advises avoiding this where possible because
                private LoRA adapters may introduce some ... (the sentence is
                truncated in the source).
            multi_modal_data: Multi-modal data. Its ``data`` is cast to
                ``torch.float16`` and indexed once per request; the object
                passed in is not modified.

        Returns:
            A list of RequestOutput objects containing the generated
            completions in the same order as the input prompts.

        Raises:
            ValueError: If neither ``prompts`` nor ``prompt_token_ids`` is
                given, or if both are given with different lengths.
        """
        if prompts is None and prompt_token_ids is None:
            raise ValueError(
                "Either prompts or prompt_token_ids must be provided.")
        if isinstance(prompts, str):
            # Convert a single prompt to a list.
            prompts = [prompts]
        if (prompts is not None and prompt_token_ids is not None
                and len(prompts) != len(prompt_token_ids)):
            raise ValueError(
                "The lengths of prompts and prompt_token_ids must be the "
                "same.")
        if sampling_params is None:
            # Use default sampling params.
            sampling_params = SamplingParams()

        # Keep the float16 copy in a local instead of assigning it back to
        # `multi_modal_data.data`, so the caller's object is left untouched.
        half_precision_data = (multi_modal_data.data.to(torch.float16)
                               if multi_modal_data else None)

        # Add the requests to the engine. In the vLLM core logic one prompt
        # counts as one request and needs one globally unique request_id.
        num_requests = (len(prompts)
                        if prompts is not None else len(prompt_token_ids))
        for i in range(num_requests):
            prompt = prompts[i] if prompts is not None else None
            token_ids = (None
                         if prompt_token_ids is None else prompt_token_ids[i])

            if multi_modal_data:
                # Get ith image while maintaining the batch dim.
                request_mm_data = MultiModalData(
                    type=multi_modal_data.type,
                    data=half_precision_data[i].unsqueeze(0))
            else:
                request_mm_data = None

            # Hand each prompt to the LLMEngine. Per the original
            # annotation, the engine side of this call:
            #   - converts the prompt into a specific input type
            #     (a SequenceGroup instance), and
            #   - puts it on the scheduler's waiting queue.
            self._add_request(
                prompt,
                sampling_params,
                token_ids,
                lora_request=lora_request,
                multi_modal_data=request_mm_data,
            )

        # All prompts of this batch have been added; run inference.
        # See _run_engine for details.
        return self._run_engine(use_tqdm)

    def _add_request(
        self,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]],
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> None:
        """Register one prompt with the engine under a fresh request id."""
        # Each prompt gets its own request_id, taken from the shared counter.
        request_id = str(next(self.request_counter))
        self.llm_engine.add_request(request_id,
                                    prompt,
                                    sampling_params,
                                    prompt_token_ids,
                                    lora_request=lora_request,
                                    multi_modal_data=multi_modal_data)

    def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
        """Step the engine until no request is unfinished.

        Args:
            use_tqdm: Whether to display a progress bar that advances once
                per finished request.

        Returns:
            The finished outputs, sorted by integer request id.
        """
        # Initialize tqdm.
        if use_tqdm:
            num_requests = self.llm_engine.get_num_unfinished_requests()
            pbar = tqdm(total=num_requests,
                        desc="Processed prompts",
                        dynamic_ncols=True)

        # Keep going while the scheduler still holds unfinished requests
        # (per the original annotation: any of the waiting / running /
        # swapped queues is non-empty).
        outputs: List[RequestOutput] = []
        while self.llm_engine.has_unfinished_requests():
            # Run one scheduling + inference step; the engine decides which
            # requests take part in this step.
            step_outputs = self.llm_engine.step()
            for output in step_outputs:
                # Collect the result of every request that finished during
                # this step.
                if output.finished:
                    outputs.append(output)
                    if use_tqdm:
                        pbar.update(1)
        if use_tqdm:
            pbar.close()

        # Sort the outputs by request ID.
        # This is necessary because some requests may be finished earlier
        # than its previous requests.
        outputs = sorted(outputs, key=lambda x: int(x.request_id))
        return outputs