(self, hidden_states, lora_layer_params=None)
| 2958 | quant_mode=quant_mode) |
| 2959 | |
def forward(self, hidden_states, lora_layer_params=None):
    """Run the gated MLP, quantizing per token before the output projection.

    The input goes through ``self.fc`` followed by the activation selected
    by ``self.hidden_act`` and, separately, through ``self.gate``. The two
    results are multiplied, passed to ``quantize_per_token`` and then to
    ``self.proj``.

    Args:
        hidden_states: Input handed to both ``self.fc`` and ``self.gate``.
        lora_layer_params: Must be ``None``; LoRA is not supported by this
            layer.

    Returns:
        Whatever ``self.proj`` returns for the ``quantize_per_token`` result.

    Raises:
        AssertionError: If ``lora_layer_params`` is not ``None``.
    """
    # Raised explicitly instead of via ``assert`` so the check still runs
    # under ``python -O``; exception type and message are unchanged.
    if lora_layer_params is not None:
        raise AssertionError("lora_layer_params not supported")

    inter = self.fc(hidden_states)
    inter = ACT2FN[self.hidden_act](inter)
    gate = self.gate(hidden_states)

    inter_x_gate = quantize_per_token(
        inter * gate,
        scale_dtype='float16',
        # The per-token sum is requested only when the quant mode does not
        # use per-group scaling.
        sum_per_token=not self.quant_mode.has_per_group_scaling(),
        sum_dtype='float16')

    return self.proj(inter_x_gate)
| 2975 | |
| 2976 | |
| 2977 | # TODO: Duplicates SmoothQuantAttention. |
nothing calls this directly
no test coverage detected