(
logger: Logger,
repository: str,
branch: str,
commit_id: str,
commit_msg: str,
metrics_recorder=None,
num_tokens_to_generate=100,
)
| 65 | |
| 66 | |
| 67 | def run_benchmark( |
| 68 | logger: Logger, |
| 69 | repository: str, |
| 70 | branch: str, |
| 71 | commit_id: str, |
| 72 | commit_msg: str, |
| 73 | metrics_recorder=None, |
| 74 | num_tokens_to_generate=100, |
| 75 | ): |
| 76 | # Check if required ML dependencies are available |
| 77 | if not TRANSFORMERS_AVAILABLE: |
| 78 | logger.error("Transformers and torch are required to run the LLaMA benchmark. Please install them with:") |
| 79 | logger.error("pip install torch transformers") |
| 80 | logger.error("Skipping LLaMA benchmark due to missing dependencies.") |
| 81 | return |
| 82 | |
| 83 | continue_metric_collection = Event() |
| 84 | metrics_thread = None |
| 85 | model_id = "meta-llama/Llama-2-7b-hf" |
| 86 | |
| 87 | # If no metrics_recorder is provided, create one for backward compatibility |
| 88 | if metrics_recorder is None: |
| 89 | try: |
| 90 | metrics_recorder = MetricsRecorder( |
| 91 | psycopg2.connect("dbname=metrics"), logger, repository, branch, commit_id, commit_msg, True |
| 92 | ) |
| 93 | should_close_recorder = True |
| 94 | except Exception as e: |
| 95 | logger.error(f"Failed to create metrics recorder: {e}") |
| 96 | return |
| 97 | else: |
| 98 | should_close_recorder = False |
| 99 | try: |
| 100 | gpu_stats = gpustat.GPUStatCollection.new_query() |
| 101 | gpu_name = gpu_stats[0]["name"] |
| 102 | benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id}) |
| 103 | logger.info(f"running benchmark #{benchmark_id} on {gpu_name} for {model_id}") |
| 104 | metrics_thread = Thread( |
| 105 | target=collect_metrics, |
| 106 | args=[benchmark_id, continue_metric_collection, metrics_recorder], |
| 107 | ) |
| 108 | metrics_thread.start() |
| 109 | logger.info("started background thread to fetch device metrics") |
| 110 | |
| 111 | os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling |
| 112 | |
| 113 | device = "cuda" |
| 114 | |
| 115 | logger.info("downloading weights") |
| 116 | # This is to avoid counting download in model load time measurement |
| 117 | model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.float16) |
| 118 | gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1) |
| 119 | logger.info("loading model") |
| 120 | start = perf_counter() |
| 121 | model = AutoModelForCausalLM.from_pretrained( |
| 122 | model_id, dtype=torch.float16, generation_config=gen_config |
| 123 | ).eval() |
| 124 | model.to(device) |
nothing calls this directly
no test coverage detected