Function generate_without_cb

examples/pytorch/continuous_batching.py:33–51 · view source on GitHub ↗

(
    model_id: str, sliding_window: int, attn_impl: str, batched_inputs: list[int], generation_config: GenerationConfig
)

Source from the content-addressed store, hash-verified

31
32
def generate_without_cb(
    model_id: str,
    sliding_window: int,
    attn_impl: str,
    batched_inputs: list[list[int]],
    generation_config: GenerationConfig,
) -> dict[str, str]:
    """Generate a completion for every prompt, one prompt at a time.

    The model is loaded in bfloat16 with the requested attention implementation,
    moved to CUDA and put in eval mode. Each prompt is then passed to
    ``model.generate`` on its own (batch size 1) with an all-ones attention mask.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the model and
            the tokenizer.
        sliding_window: When strictly positive, replaces ``model.config.sliding_window``,
            but only if the loaded config already has a non-None ``sliding_window``.
        attn_impl: Value forwarded as ``attn_implementation`` when loading the model.
        batched_inputs: One list of token IDs per prompt.
        generation_config: Generation settings forwarded to ``model.generate``.

    Returns:
        A mapping from the space-joined prompt token IDs to the decoded newly
        generated tokens (prompt excluded, special tokens kept). Identical prompts
        share a key, so a later occurrence overwrites an earlier one.
    """
    # Setup model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, attn_implementation=attn_impl)
    model = model.cuda().eval()  # type: ignore
    # Only override the window when the config already defines one.
    if sliding_window > 0 and getattr(model.config, "sliding_window", None) is not None:
        model.config.sliding_window = sliding_window
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Generate one by one
    decoded_outputs: dict[str, str] = {}
    for token_ids in tqdm(batched_inputs, desc="Generating outputs without CB"):
        # This will be used to identify the output after batched generation
        key = " ".join(map(str, token_ids))
        # Wrap in an outer list so the tensor has a leading batch dimension of 1.
        input_ids = torch.tensor([token_ids]).to("cuda")
        attention_mask = torch.ones_like(input_ids)
        outputs = model.generate(input_ids, attention_mask=attention_mask, generation_config=generation_config)
        # Drop the prompt prefix and keep only the newly generated tokens.
        generated_tokens = outputs[0][input_ids.shape[1] :]
        decoded_outputs[key] = tokenizer.decode(generated_tokens, skip_special_tokens=False)  # type: ignore
    return decoded_outputs
52
53
54	def batch_generate(

continuous_batching.pyFile · 0.85

evalMethod · 0.80

joinMethod · 0.80

from_pretrainedMethod · 0.45

cudaMethod · 0.45

toMethod · 0.45

generateMethod · 0.45

decodeMethod · 0.45

no test coverage detected