(
model_id: str, sliding_window: int, attn_impl: str, batched_inputs: list[int], generation_config: GenerationConfig
)
| 31 | |
| 32 | |
def generate_without_cb(
    model_id: str,
    sliding_window: int,
    attn_impl: str,
    batched_inputs: list[list[int]],
    generation_config: GenerationConfig,
) -> dict[str, str]:
    """Generate a completion for each prompt, one prompt at a time.

    The model is loaded in bfloat16 with the requested attention
    implementation, moved to CUDA and put in eval mode. Each prompt is then
    passed through ``model.generate`` on its own (batch size 1).

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the model
            and the tokenizer.
        sliding_window: When strictly positive, and the model config already
            has a non-``None`` ``sliding_window``, the config value is
            overwritten with this one. Otherwise the config is left alone.
        attn_impl: Value forwarded as ``attn_implementation`` when loading.
        batched_inputs: Prompts, each given as a list of token ids.
        generation_config: Generation settings forwarded to ``model.generate``.

    Returns:
        A dict mapping each prompt, encoded as its token ids joined by single
        spaces, to the decoded text of the newly generated tokens (special
        tokens are kept). Identical prompts share a key, so only the output
        of the last occurrence is retained.
    """
    # Setup model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, attn_implementation=attn_impl)
    model = model.cuda().eval()  # type: ignore
    # NOTE(review): the override is applied to the config after the model has
    # been instantiated; confirm the attention layers actually pick it up.
    if sliding_window > 0 and getattr(model.config, "sliding_window", None) is not None:
        model.config.sliding_window = sliding_window
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Generate one by one
    decoded_outputs: dict[str, str] = {}
    for prompt_ids in tqdm(batched_inputs, desc="Generating outputs without CB"):
        # This will be used to identify the output after batched generation
        key = " ".join(map(str, prompt_ids))
        # Shape (1, prompt_len), created directly on the model's device.
        input_ids = torch.tensor([prompt_ids], device=model.device)
        # Single unpadded sequence, so every position is attended to.
        attention_mask = torch.ones_like(input_ids)
        outputs = model.generate(input_ids, attention_mask=attention_mask, generation_config=generation_config)
        # Drop the leading prompt positions and keep only the continuation.
        generated_tokens = outputs[0][input_ids.shape[1] :]
        decoded_outputs[key] = tokenizer.decode(generated_tokens, skip_special_tokens=False)  # type: ignore
    return decoded_outputs
| 52 | |
| 53 | |
| 54 | def batch_generate( |
no test coverage detected