From e06c4421cb418c4fdc08a6c58c207630e50c1d31 Mon Sep 17 00:00:00 2001 From: qubvel Date: Tue, 8 Apr 2025 17:27:24 +0000 Subject: [PATCH 01/32] Albert --- .../models/albert/modeling_albert.py | 106 ++++++------------ 1 file changed, 32 insertions(+), 74 deletions(-) diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 68abb317ee3f..b199a087326a 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -46,6 +46,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -508,6 +509,7 @@ def __init__(self, config: AlbertConfig): self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size) self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)]) + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -515,8 +517,7 @@ def forward( head_mask: Optional[torch.FloatTensor] = None, output_attentions: bool = False, output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[BaseModelOutput, Tuple]: + ) -> BaseModelOutput: hidden_states = self.embedding_hidden_mapping_in(hidden_states) all_hidden_states = (hidden_states,) if output_hidden_states else None @@ -546,8 +547,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) @@ -733,6 +732,7 @@ def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: inner_group_idx = int(layer - group_idx * self.config.inner_group_num) self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -749,13 +749,11 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[BaseModelOutputWithPooling, Tuple]: + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -802,22 +800,18 @@ def forward( head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( embedding_output, extended_attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None - if not return_dict: - 
return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -855,6 +849,7 @@ def set_output_embeddings(self, new_embeddings: nn.Linear) -> None: def get_input_embeddings(self) -> nn.Embedding: return self.albert.embeddings.word_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=AlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -869,8 +864,7 @@ def forward( sentence_order_label: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[AlbertForPreTrainingOutput, Tuple]: + ) -> AlbertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -899,9 +893,8 @@ def forward( >>> prediction_logits = outputs.prediction_logits >>> sop_logits = outputs.sop_logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.albert( + outputs: BaseModelOutputWithPooling = self.albert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -910,10 +903,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output, pooled_output = outputs[:2] + sequence_output = outputs.last_hidden_state + pooled_output = outputs.pooler_output prediction_scores = self.predictions(sequence_output) sop_scores = self.sop_classifier(pooled_output) @@ -925,10 +918,6 @@ def forward( sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1)) total_loss = masked_lm_loss + sentence_order_loss - if not return_dict: - output = (prediction_scores, sop_scores) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return AlbertForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, @@ -1007,6 +996,7 @@ def set_output_embeddings(self, new_embeddings: nn.Linear) -> None: def get_input_embeddings(self) -> nn.Embedding: return self.albert.embeddings.word_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1020,8 +1010,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[MaskedLMOutput, Tuple]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., @@ -1059,9 +1048,8 @@ def forward( 0.81 ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.albert( + outputs: BaseModelOutputWithPooling = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1070,10 +1058,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_outputs = outputs[0] - + sequence_outputs = outputs.last_hidden_state prediction_scores = self.predictions(sequence_outputs) masked_lm_loss = None @@ -1081,10 +1067,6 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1113,6 +1095,7 @@ def __init__(self, config: AlbertConfig): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="textattack/albert-base-v2-imdb", @@ -1132,17 +1115,15 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[SequenceClassifierOutput, Tuple]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.albert( + outputs: BaseModelOutputWithPooling = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1151,10 +1132,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1182,10 +1162,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, @@ -1218,6 +1194,7 @@ def __init__(self, config: AlbertConfig): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1235,15 +1212,13 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[TokenClassifierOutput, Tuple]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.albert( + outputs: BaseModelOutputWithPooling = self.albert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1252,10 +1227,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1265,10 +1239,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1295,6 +1265,7 @@ def __init__(self, config: AlbertConfig): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="twmkn9/albert-base-v2-squad2", @@ -1317,8 +1288,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[AlbertForPreTrainingOutput, Tuple]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1329,9 +1299,8 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.albert( + outputs: BaseModelOutputWithPooling = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1340,10 +1309,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits: torch.Tensor = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -1367,10 +1335,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -1398,6 +1362,7 @@ def __init__(self, config: AlbertConfig): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1415,15 +1380,13 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[AlbertForPreTrainingOutput, Tuple]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. 
(see *input_ids* above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1435,7 +1398,7 @@ def forward( if inputs_embeds is not None else None ) - outputs = self.albert( + outputs: BaseModelOutputWithPooling = self.albert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1444,10 +1407,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits: torch.Tensor = self.classifier(pooled_output) @@ -1458,10 +1420,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, From 7ac9391724c4990bb54731d6e4355bd7c75cead7 Mon Sep 17 00:00:00 2001 From: qubvel Date: Tue, 8 Apr 2025 17:38:52 +0000 Subject: [PATCH 02/32] Align --- .../models/align/modeling_align.py | 109 +++++------------- 1 file changed, 26 insertions(+), 83 deletions(-) diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index a007b7a7c6d6..a45d65b5ba2e 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -35,6 +35,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -643,11 +644,11 @@ def round_repeats(repeats): self.blocks = nn.ModuleList(blocks) + @can_return_tuple def forward( self, hidden_states: torch.FloatTensor, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, ) -> BaseModelOutputWithPoolingAndNoAttention: all_hidden_states = (hidden_states,) if output_hidden_states else None @@ -656,9 +657,6 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) - return BaseModelOutputWithNoAttention( last_hidden_state=hidden_states, hidden_states=all_hidden_states, @@ -1063,6 +1061,7 @@ def __init__(self, config): self.layer = nn.ModuleList([AlignTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -1074,8 +1073,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -1128,18 +1126,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - 
all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -1220,6 +1206,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value + @can_return_tuple @add_start_docstrings_to_model_forward(ALIGN_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=AlignTextConfig) def forward( @@ -1232,8 +1219,7 @@ def forward( inputs_embeds: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" Returns: @@ -1255,7 +1241,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -1298,20 +1283,16 @@ def forward( token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1350,14 +1331,14 @@ def __init__(self, config: AlignVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.convolution + @can_return_tuple @add_start_docstrings_to_model_forward(ALIGN_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndNoAttention, config_class=AlignVisionConfig) def forward( self, pixel_values: Optional[torch.FloatTensor] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]: + ) -> BaseModelOutputWithPoolingAndNoAttention: r""" Returns: @@ -1383,26 +1364,21 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") embedding_output = self.embeddings(pixel_values) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndNoAttention = self.encoder( embedding_output, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) # Apply pooling - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = self.pooler(last_hidden_state) # Reshape (batch_size, projection_dim, 1 , 1) -> (batch_size, projection_dim) pooled_output = pooled_output.reshape(pooled_output.shape[:2]) - if 
not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -1453,9 +1429,6 @@ def get_text_features( position_ids: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1473,37 +1446,22 @@ def get_text_features( >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) ```""" - # Use ALIGN model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, ) - - last_hidden_state = text_outputs[0][:, 0, :] + last_hidden_state = text_outputs.last_hidden_state[:, 0, :] text_features = self.text_projection(last_hidden_state) - return text_features @add_start_docstrings_to_model_forward(ALIGN_VISION_INPUTS_DOCSTRING) - def get_image_features( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: + def get_image_features(self, pixel_values: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: r""" Returns: image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by @@ -1526,22 +1484,15 @@ def get_image_features( >>> image_features = model.get_image_features(**inputs) ```""" - # Use ALIGN model's config for some fields (if specified) instead of those of vision & text components. 
- output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPoolingAndNoAttention = self.vision_model( pixel_values=pixel_values, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_hidden_states=False, ) - - image_features = vision_outputs[1] # pooled_output - + image_features = vision_outputs.pooler_output return image_features + @can_return_tuple @add_start_docstrings_to_model_forward(ALIGN_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=AlignOutput, config_class=AlignConfig) def forward( @@ -1556,8 +1507,7 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, AlignOutput]: + ) -> AlignOutput: r""" Returns: @@ -1587,15 +1537,13 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPoolingAndNoAttention = self.vision_model( pixel_values=pixel_values, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1604,11 +1552,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - image_embeds = vision_outputs[1] - text_embeds = text_outputs[0][:, 0, :] + image_embeds = vision_outputs.pooler_output + text_embeds = text_outputs.last_hidden_state[:, 0, :] text_embeds = self.text_projection(text_embeds) # normalized features @@ -1623,10 +1570,6 @@ def forward( if return_loss: loss = align_loss(logits_per_text) - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - return AlignOutput( loss=loss, logits_per_image=logits_per_image, From 5ac0bff012b7f1bd9475c0c51b24331d243095f6 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 12:50:42 +0000 Subject: [PATCH 03/32] Bert (breaks fx tests) --- src/transformers/models/bert/modeling_bert.py | 154 +++++------------- 1 file changed, 41 insertions(+), 113 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 38d2c8c8b5a5..75bfdb480e3b 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -19,7 +19,7 @@ import os import warnings from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -51,6 +51,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, get_torch_version, logging, replace_return_docstrings, @@ -648,6 +649,7 @@ def __init__(self, config): self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) 
self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -659,8 +661,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -713,18 +714,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -1000,6 +989,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1020,8 +1010,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1046,7 +1035,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1141,7 +1129,7 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1151,14 +1139,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1195,6 +1179,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1209,8 +1194,7 @@ def forward( next_sentence_label: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BertForPreTrainingOutput]: + ) -> BertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., @@ -1243,9 +1227,8 @@ def forward( >>> seq_relationship_logits = outputs.seq_relationship_logits ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1254,10 +1237,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output, pooled_output = outputs[:2] + sequence_output = outputs.last_hidden_state + pooled_output = outputs.pooler_output prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) total_loss = None @@ -1267,10 +1250,6 @@ def forward( next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return BertForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, @@ -1305,6 +1284,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1326,9 +1306,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **loss_kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1353,11 +1332,10 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.bert( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1370,20 +1348,15 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) lm_loss = None if labels is not None: lm_loss = self.loss_function(prediction_scores, labels, self.config.vocab_size, **loss_kwargs) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1428,6 +1401,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1449,8 +1423,7 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1458,9 +1431,7 @@ def forward( loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1471,10 +1442,9 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) masked_lm_loss = None @@ -1482,10 +1452,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1532,6 +1498,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1545,9 +1512,8 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]: + ) -> NextSentencePredictorOutput: r""" labels (`torch.LongTensor` of shape 
`(batch_size,)`, *optional*): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair @@ -1585,9 +1551,7 @@ def forward( ) labels = kwargs.pop("next_sentence_label") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1596,11 +1560,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output seq_relationship_scores = self.cls(pooled_output) next_sentence_loss = None @@ -1608,10 +1570,6 @@ def forward( loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) - if not return_dict: - output = (seq_relationship_scores,) + outputs[2:] - return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output - return NextSentencePredictorOutput( loss=next_sentence_loss, logits=seq_relationship_scores, @@ -1643,6 +1601,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, @@ -1662,17 +1621,14 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1681,11 +1637,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1711,9 +1665,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, @@ -1744,6 +1695,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1761,15 +1713,13 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1782,7 +1732,7 @@ def forward( else None ) - outputs = self.bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1791,11 +1741,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) @@ -1805,10 +1753,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1839,6 +1783,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, @@ -1858,15 +1803,12 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> 
TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1875,11 +1817,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1888,10 +1828,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1918,6 +1854,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_QA, @@ -1940,8 +1877,7 @@ def forward( end_positions: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1952,9 +1888,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1963,11 +1897,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1990,10 +1922,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, From 68b08d451d474c822074054b2d08b37ec238f0ca Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 13:02:42 +0000 Subject: [PATCH 04/32] bert_generation --- .../modeling_bert_generation.py | 53 +++++-------------- 1 file changed, 12 insertions(+), 41 deletions(-) diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 75fd1b17168e..4282ac6bd607 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -32,6 +32,7 @@ add_start_docstrings_to_model_forward, logging, replace_return_docstrings, + can_return_tuple, ) from .configuration_bert_generation import BertGenerationConfig @@ -367,7 +368,7 @@ def feed_forward_chunk(self, attention_output): return layer_output -# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->BertGeneration +# Copied from transformers.models.bert.modeling_bert.BertEncoder class BertEncoder(nn.Module): def __init__(self, config): super().__init__() @@ -375,6 +376,7 @@ def __init__(self, config): self.layer = nn.ModuleList([BertGenerationLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -386,8 +388,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -440,18 +441,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -715,6 +704,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple 
@add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -734,9 +724,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, # NOOP kwargs, for now - ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -759,7 +748,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -814,7 +802,7 @@ def forward( past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -824,20 +812,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] - if not return_dict: - return (sequence_output,) + encoder_outputs[1:] - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=sequence_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - ) + return outputs class BertGenerationOnlyLMHead(nn.Module): @@ -886,6 +863,7 @@ def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings self.lm_head.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -902,9 +880,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -949,11 +926,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.bert( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, position_ids=position_ids, @@ -965,11 +941,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, **kwargs, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) lm_loss = None @@ -981,10 +956,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[1:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, From 7584bd1db07e2b8462e9eab906ae17ef0528e83a Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 13:20:39 +0000 Subject: [PATCH 05/32] Fixup --- .../models/bert_generation/modeling_bert_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 4282ac6bd607..0b380ee61019 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -15,7 +15,7 @@ """PyTorch BERT model specific for generation.""" import math -from typing import Optional, Tuple, Union +from typing import Optional, Tuple import torch import torch.utils.checkpoint @@ -30,9 +30,9 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, - can_return_tuple, ) from .configuration_bert_generation import BertGenerationConfig From 1feae644455a80db89b552dfcf8e609beb35d637 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 13:21:42 +0000 Subject: [PATCH 06/32] chinese_clip --- .../chinese_clip/modeling_chinese_clip.py | 126 +++++------------- 1 file changed, 32 insertions(+), 94 deletions(-) diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 647e8f1c2421..b4fffdc74e97 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -16,7 +16,7 @@ import math from dataclasses import dataclass -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple import torch import torch.utils.checkpoint @@ -36,6 +36,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -924,6 +925,7 @@ def __init__(self, config): self.layer = nn.ModuleList([ChineseCLIPTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -935,8 +937,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> 
BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -989,18 +990,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -1025,13 +1014,13 @@ def __init__(self, config: ChineseCLIPConfig): self.layers = nn.ModuleList([ChineseCLIPVisionLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -1051,7 +1040,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -1080,8 +1068,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -1098,6 +1084,7 @@ def __init__(self, config: ChineseCLIPVisionConfig): self.encoder = ChineseCLIPVisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple @add_start_docstrings_to_model_forward(CHINESE_CLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ChineseCLIPVisionConfig) def forward( @@ -1106,8 +1093,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: """ @@ -1115,7 +1101,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1123,20 +1108,16 @@ def forward( hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if 
not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -1191,6 +1172,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(CHINESE_CLIP_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1211,8 +1193,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1237,7 +1218,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1300,7 +1280,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1310,14 +1290,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1346,6 +1322,7 @@ def __init__(self, config: ChineseCLIPVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @can_return_tuple @add_start_docstrings_to_model_forward(CHINESE_CLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ChineseCLIPVisionConfig) def forward( @@ -1354,8 +1331,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1378,14 +1354,11 @@ def forward( >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - return self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) @@ -1425,6 +1398,7 @@ def __init__(self, config: ChineseCLIPConfig): # Initialize weights and apply final processing self.post_init() + 
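The decorator applied throughout this series keeps `forward` building a `ModelOutput` unconditionally and moves the tuple conversion into a wrapper. The sketch below shows that core idea only; it is a simplified stand-in, not the `can_return_tuple` implementation shipped in `transformers.utils`, and the dummy classes exist purely to make the snippet runnable.

```python
# Simplified illustration of the decorator pattern; not the library code.
import functools
from dataclasses import astuple, dataclass


def can_return_tuple_sketch(forward):
    @functools.wraps(forward)
    def wrapper(self, *args, return_dict=None, **kwargs):
        # Fall back to the model config when the caller does not say.
        if return_dict is None:
            return_dict = getattr(self.config, "use_return_dict", True)
        output = forward(self, *args, **kwargs)  # always a dataclass-style output
        return output if return_dict else output.to_tuple()

    return wrapper


@dataclass
class DummyOutput:
    last_hidden_state: int
    pooler_output: int

    def to_tuple(self):
        return astuple(self)


class DummyConfig:
    use_return_dict = True


class DummyModel:
    config = DummyConfig()

    @can_return_tuple_sketch
    def forward(self):
        return DummyOutput(last_hidden_state=1, pooler_output=2)


model = DummyModel()
print(model.forward())                   # DummyOutput(last_hidden_state=1, pooler_output=2)
print(model.forward(return_dict=False))  # (1, 2)
```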
@can_return_tuple @add_start_docstrings_to_model_forward(CHINESE_CLIP_TEXT_INPUTS_DOCSTRING) def get_text_features( self, @@ -1432,9 +1406,6 @@ def get_text_features( attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1453,36 +1424,24 @@ def get_text_features( >>> text_features = model.get_text_features(**inputs) >>> text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True) ```""" - # Use CHINESE_CLIP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, ) - - pooled_output = text_outputs[0][:, 0, :] + pooled_output = text_outputs.last_hidden_state[:, 0, :] text_features = self.text_projection(pooled_output) - return text_features + @can_return_tuple @add_start_docstrings_to_model_forward(CHINESE_CLIP_VISION_INPUTS_DOCSTRING) def get_image_features( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1507,26 +1466,17 @@ def get_image_features( >>> image_features = model.get_image_features(**inputs) >>> image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True) ```""" - # Use CHINESE_CLIP model's config for some fields (if specified) instead of those of vision & text components. 
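As context for what `get_text_features` and `get_image_features` feed into, the snippet below reproduces the CLIP-style similarity computation with random stand-in tensors; the feature dimension and the temperature value are illustrative, not taken from any ChineseCLIP checkpoint.

```python
# Stand-alone illustration of the similarity computed in the model's forward;
# tensors and the temperature are made-up placeholders.
import torch

text_features = torch.randn(2, 512)   # projected text embeddings (batch of 2)
image_features = torch.randn(3, 512)  # projected image embeddings (batch of 3)

# L2-normalize so the dot product becomes a cosine similarity
text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)
image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)

logit_scale = torch.tensor(100.0)  # exp() of a learned temperature, illustrative value
logits_per_text = logit_scale * text_features @ image_features.t()  # shape (2, 3)
logits_per_image = logits_per_text.t()                              # shape (3, 2)
```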
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - - pooled_output = vision_outputs[1] # pooled_output + pooled_output = vision_outputs.pooler_output image_features = self.visual_projection(pooled_output) - return image_features + @can_return_tuple @add_start_docstrings_to_model_forward(CHINESE_CLIP_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=ChineseCLIPOutput, config_class=ChineseCLIPConfig) def forward( @@ -1540,8 +1490,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, ChineseCLIPOutput]: + ) -> ChineseCLIPOutput: r""" Returns: @@ -1569,30 +1518,27 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - image_embeds = vision_outputs[1] + image_embeds = vision_outputs.pooler_output image_embeds = self.visual_projection(image_embeds) - text_embeds = text_outputs[0][:, 0, :] + text_embeds = text_outputs.last_hidden_state[:, 0, :] text_embeds = self.text_projection(text_embeds) # normalized features @@ -1608,14 +1554,6 @@ def forward( if return_loss: loss = chinese_clip_loss(logits_per_text) - if not return_dict: - # fix the None pooled_output of text_outputs to conform with dict_output - pooled_output = text_outputs[1] - if pooled_output is None: - text_outputs = (text_outputs[0],) + text_outputs[2:] - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - return ChineseCLIPOutput( loss=loss, logits_per_image=logits_per_image, From a403e491d77e8590841988376cfbbb95bf101585 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 13:29:51 +0000 Subject: [PATCH 07/32] clap --- src/transformers/models/clap/modeling_clap.py | 138 ++++-------------- 1 file changed, 32 insertions(+), 106 deletions(-) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index b2fdf0dd7eeb..7a3e859ae4d2 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ 
b/src/transformers/models/clap/modeling_clap.py @@ -35,6 +35,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -892,6 +893,7 @@ def reshape_mel2img(self, normalized_input_features): return normalized_input_features + @can_return_tuple def forward( self, input_features, @@ -901,8 +903,7 @@ def forward( output_hidden_states: Optional[bool] = False, output_hidden_states_before_downsampling: Optional[bool] = False, always_partition: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple, ClapAudioModelOutput]: + ) -> BaseModelOutputWithPooling: input_features = input_features.transpose(1, 3) normalized_input_features = self.batch_norm(input_features) normalized_input_features = normalized_input_features.transpose(1, 3) @@ -997,18 +998,6 @@ def forward( latent_output = self.avgpool(torch.flatten(last_hidden_state, 2)) latent_output = torch.flatten(latent_output, 1) - if not return_dict: - return tuple( - v - for v in [ - last_hidden_state, - latent_output, - all_reshaped_hidden_states, - all_self_attentions, - ] - if v is not None - ) - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=latent_output, @@ -1561,6 +1550,7 @@ def __init__(self, config): self.layer = nn.ModuleList([ClapTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -1572,8 +1562,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -1626,18 +1615,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -1709,6 +1686,7 @@ def __init__(self, config: ClapAudioConfig): def get_input_embeddings(self) -> nn.Module: return self.audio_encoder.patch_embed.proj + @can_return_tuple @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ClapAudioConfig) def forward( @@ -1717,8 +1695,7 @@ def forward( is_longer: Optional[torch.BoolTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1739,7 +1716,6 @@ def forward( >>> outputs = model(**inputs) >>> last_hidden_state = outputs.last_hidden_state ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else 
self.config.output_hidden_states @@ -1750,7 +1726,6 @@ def forward( is_longer=is_longer, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) @@ -1790,6 +1765,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value + @can_return_tuple def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1804,8 +1780,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1830,7 +1805,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1893,7 +1867,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1903,14 +1877,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1963,10 +1933,7 @@ def get_text_features( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Returns: text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by @@ -1983,23 +1950,15 @@ def get_text_features( >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) ```""" - # Use CLAP model's config for some fields (if specified) instead of those of audio & text components. 
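The symmetric contrastive objective used later in `ClapModel.forward` boils down to cross-entropy where row i should match column i. The sketch below is a stand-alone version of that pattern with a random similarity matrix, purely for illustration.

```python
# Stand-alone sketch of the symmetric contrastive loss pattern; the random
# similarity matrix is a placeholder, not real model output.
import torch
import torch.nn.functional as F


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    # The "correct" class for row i is column i.
    labels = torch.arange(logits.size(0), device=logits.device)
    return F.cross_entropy(logits, labels)


logits_per_text = torch.randn(4, 4)  # text-to-audio similarities for a batch of 4
caption_loss = contrastive_loss(logits_per_text)
audio_loss = contrastive_loss(logits_per_text.t())
loss = (caption_loss + audio_loss) / 2.0
```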
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, ) - pooled_output = text_outputs[1] if return_dict is not None else text_outputs.pooler_output + pooled_output = text_outputs.pooler_output text_features = self.text_projection(pooled_output) text_features = F.normalize(text_features, dim=-1) @@ -2010,11 +1969,7 @@ def get_audio_features( self, input_features: Optional[torch.Tensor] = None, is_longer: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Returns: audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The audio embeddings obtained by @@ -2032,25 +1987,19 @@ def get_audio_features( >>> inputs = feature_extractor(random_audio, return_tensors="pt") >>> audio_features = model.get_audio_features(**inputs) ```""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - audio_outputs = self.audio_model( + audio_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.audio_model( input_features=input_features, is_longer=is_longer, - return_dict=return_dict, ) - pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output - + pooled_output = audio_outputs.pooler_output audio_features = self.audio_projection(pooled_output) audio_features = F.normalize(audio_features, dim=-1) return audio_features + @can_return_tuple @add_start_docstrings_to_model_forward(CLAP_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=ClapOutput, config_class=ClapConfig) def forward( @@ -2063,7 +2012,6 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> Union[Tuple, ClapOutput]: r""" Returns: @@ -2093,29 +2041,26 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - audio_outputs = self.audio_model( + audio_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.audio_model( input_features=input_features, is_longer=is_longer, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, 
output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - audio_embeds = audio_outputs[1] if not return_dict else audio_outputs.pooler_output + audio_embeds = audio_outputs.pooler_output audio_embeds = self.audio_projection(audio_embeds) - text_embeds = text_outputs[1] if not return_dict else text_outputs.pooler_output + text_embeds = text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) # normalized features @@ -2134,10 +2079,6 @@ def forward( audio_loss = contrastive_loss(logits_per_audio.t()) loss = (caption_loss + audio_loss) / 2.0 - if not return_dict: - output = (logits_per_audio, logits_per_text, text_embeds, audio_embeds, text_outputs, audio_outputs) - return ((loss,) + output) if loss is not None else output - return ClapOutput( loss=loss, logits_per_audio=logits_per_audio, @@ -2171,6 +2112,7 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.text_model.embeddings.word_embeddings = value + @can_return_tuple @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=ClapTextModelOutput, config_class=ClapTextConfig) def forward( @@ -2180,8 +2122,7 @@ def forward( position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, ClapTextModelOutput]: + ) -> ClapTextModelOutput: r""" Returns: @@ -2198,25 +2139,17 @@ def forward( >>> outputs = model(**inputs) >>> text_embeds = outputs.text_embeds ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - - pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output - + pooled_output = text_outputs.pooler_output text_embeds = self.text_projection(pooled_output) - if not return_dict: - outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] - return tuple(output for output in outputs if output is not None) - return ClapTextModelOutput( text_embeds=text_embeds, last_hidden_state=text_outputs.last_hidden_state, @@ -2245,6 +2178,7 @@ def __init__(self, config: ClapAudioConfig): def get_input_embeddings(self) -> nn.Module: return self.audio_model.audio_encoder.patch_embed.proj + @can_return_tuple @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=ClapAudioModelOutput, config_class=ClapAudioConfig) def forward( @@ -2253,8 +2187,7 @@ def forward( is_longer: Optional[torch.BoolTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, ClapAudioModelOutput]: + ) -> ClapAudioModelOutput: r""" Returns: @@ -2274,28 +2207,21 @@ def forward( >>> outputs = model(**inputs) >>> audio_embeds = outputs.audio_embeds ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - audio_outputs = self.audio_model( + audio_outputs: 
BaseModelOutputWithPoolingAndCrossAttentions = self.audio_model( input_features=input_features, is_longer=is_longer, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output - + pooled_output = audio_outputs.pooler_output audio_embeds = self.audio_projection(pooled_output) - if not return_dict: - outputs = (audio_embeds, audio_outputs[0]) + audio_outputs[2:] - return tuple(output for output in outputs if output is not None) - return ClapAudioModelOutput( audio_embeds=audio_embeds, last_hidden_state=audio_outputs.last_hidden_state, From cd8c9fc9a53b65d411b5922159c1c2a722c1f23d Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 13:32:32 +0000 Subject: [PATCH 08/32] fix --- .../models/bert_generation/modeling_bert_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 0b380ee61019..ea1c1843de97 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -368,7 +368,7 @@ def feed_forward_chunk(self, attention_output): return layer_output -# Copied from transformers.models.bert.modeling_bert.BertEncoder +# Copied from transformers.models.bert.modeling_bert.BertEncoder with BertLayer->BertGenerationLayer class BertEncoder(nn.Module): def __init__(self, config): super().__init__() From 6d5e282c8a6e8b21d6bb311fb15abe349a787656 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 13:43:03 +0000 Subject: [PATCH 09/32] altclip --- .../models/altclip/modeling_altclip.py | 140 +++++------------- 1 file changed, 40 insertions(+), 100 deletions(-) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 6e4c9e650da0..bd2e6895270c 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -16,7 +16,7 @@ import math from dataclasses import dataclass -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple import torch import torch.nn as nn @@ -32,7 +32,14 @@ ) from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import ModelOutput, add_start_docstrings_to_model_forward, logging, replace_return_docstrings, torch_int +from ...utils import ( + ModelOutput, + add_start_docstrings_to_model_forward, + can_return_tuple, + logging, + replace_return_docstrings, + torch_int, +) from .configuration_altclip import AltCLIPConfig, AltCLIPTextConfig, AltCLIPVisionConfig @@ -619,6 +626,7 @@ def __init__(self, config): self.layer = nn.ModuleList([AltRobertaLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -630,8 +638,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if 
output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -684,18 +691,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -907,6 +902,7 @@ def __init__(self, config: AltCLIPConfig): self.layers = nn.ModuleList([AltCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -914,8 +910,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -949,7 +944,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -982,8 +976,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -1141,6 +1133,7 @@ def __init__(self, config: AltCLIPVisionConfig): self.encoder = AltCLIPEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple @add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AltCLIPVisionConfig) def forward( @@ -1148,9 +1141,8 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, interpolate_pos_encoding: Optional[bool] = False, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1159,7 +1151,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1167,20 +1158,16 @@ def forward( hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - 
return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -1202,6 +1189,7 @@ def __init__(self, config: AltCLIPVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @can_return_tuple @add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AltCLIPVisionConfig) def forward( @@ -1210,8 +1198,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1234,14 +1221,11 @@ def forward( >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - return self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) @@ -1305,8 +1289,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1331,7 +1314,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1394,7 +1376,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1404,14 +1386,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1441,6 +1419,7 @@ def set_input_embeddings(self, value: nn.Embedding) -> None: def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding: return super().resize_token_embeddings(new_num_tokens) + @can_return_tuple @add_start_docstrings_to_model_forward(ALTCLIP_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndProjection, config_class=AltCLIPTextConfig) def forward( @@ -1454,9 +1433,8 @@ def forward( encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPoolingAndProjection]: + ) -> BaseModelOutputWithPoolingAndProjection: r""" Returns: @@ -1476,10 +1454,7 @@ def forward( >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1490,11 +1465,10 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) # last module outputs - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state # project every module sequence_output = self.pre_LN(sequence_output) @@ -1503,9 +1477,6 @@ def forward( projection_state = self.transformation(sequence_output) pooler_output = projection_state[:, 0] - if not return_dict: - return (projection_state, pooler_output) + outputs[2:4] - return BaseModelOutputWithPoolingAndProjection( last_hidden_state=projection_state, pooler_output=pooler_output, @@ -1555,9 +1526,6 @@ def get_text_features( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, token_type_ids=None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1574,23 +1542,15 @@ def get_text_features( 
>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) ```""" - # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, token_type_ids=token_type_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, ) - pooled_output = text_outputs[1] + pooled_output = text_outputs.pooler_output text_features = self.text_projection(pooled_output) return text_features @@ -1599,10 +1559,7 @@ def get_text_features( def get_image_features( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1623,26 +1580,17 @@ def get_image_features( >>> inputs = processor(images=image, return_tensors="pt") >>> image_features = model.get_image_features(**inputs) ```""" - # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - - pooled_output = vision_outputs[1] # pooled_output + pooled_output = vision_outputs.pooler_output image_features = self.visual_projection(pooled_output) - return image_features + @can_return_tuple @add_start_docstrings_to_model_forward(ALTCLIP_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=AltCLIPOutput, config_class=AltCLIPConfig) def forward( @@ -1656,8 +1604,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, AltCLIPOutput]: + ) -> AltCLIPOutput: r""" Returns: @@ -1684,30 +1631,27 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, 
output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - image_embeds = vision_outputs[1] + image_embeds = vision_outputs.pooler_output image_embeds = self.visual_projection(image_embeds) - text_embeds = text_outputs[1] + text_embeds = text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) # normalized features @@ -1723,10 +1667,6 @@ def forward( if return_loss: loss = clip_loss(logits_per_text) - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - return AltCLIPOutput( loss=loss, logits_per_image=logits_per_image, From e307191facdc3b0328a63d8bc8e99893c4d414fa Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 13:48:26 +0000 Subject: [PATCH 10/32] bridgetower --- .../bridgetower/modeling_bridgetower.py | 83 ++++--------------- 1 file changed, 18 insertions(+), 65 deletions(-) diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 524b4caa7431..21bb7c9e09d2 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -37,6 +37,7 @@ from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -837,6 +838,7 @@ def __init__(self, config): self.layer = nn.ModuleList([BridgeTowerTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -848,8 +850,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -902,18 +903,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -1143,8 +1132,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1169,7 +1157,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1232,7 +1219,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1242,14 +1229,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1328,6 +1311,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.text_model.set_input_embeddings(value) + @can_return_tuple @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BridgeTowerModelOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1343,10 +1327,9 @@ def forward( image_token_type_idx: Optional[int] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, labels: Optional[torch.LongTensor] = None, interpolate_pos_encoding: bool = False, - ) -> Union[Tuple[torch.Tensor], BridgeTowerModelOutput]: + ) -> BridgeTowerModelOutput: r""" output_hidden_states (`bool`, *optional*): If set to `True`, hidden states are returned as a list containing the hidden states of text, image, and @@ -1393,7 +1376,6 @@ def forward( "BridgeTowerModel does not use `inputs_embeds`. Make sure to pass in `input_ids` instead." 
) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict image_token_type_idx = image_token_type_idx if image_token_type_idx else 1 input_shape = input_ids.size() text_embeds = self.text_model.embeddings(input_ids=input_ids) @@ -1547,13 +1529,6 @@ def forward( if output_hidden_states: all_hidden_states = (all_hidden_states_text, all_hidden_states_image, all_hidden_states_cross) - if not return_dict: - return tuple( - v - for v in [text_features, image_features, cls_features, all_hidden_states, all_self_attentions] - if v is not None - ) - return BridgeTowerModelOutput( text_features=text_features, image_features=image_features, @@ -1636,6 +1611,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.mlm_score.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1650,7 +1626,6 @@ def forward( image_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, labels: Optional[torch.LongTensor] = None, ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]: r""" @@ -1685,8 +1660,7 @@ def forward( >>> print(results) .a cat looking out of the window. ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.bridgetower( + outputs: BridgeTowerModelOutput = self.bridgetower( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1697,10 +1671,9 @@ def forward( image_embeds=image_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - mlm_logits = self.mlm_score(outputs.text_features if return_dict else outputs[0]) + mlm_logits = self.mlm_score(outputs.text_features) masked_lm_loss = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token @@ -1708,10 +1681,6 @@ def forward( labels = labels.to(mlm_logits.device) masked_lm_loss = loss_fct(mlm_logits.view(-1, self.config.text_config.vocab_size), labels.view(-1)) - if not return_dict: - output = tuple(mlm_logits) - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=mlm_logits, @@ -1738,6 +1707,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1752,7 +1722,6 @@ def forward( image_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, labels: Optional[torch.LongTensor] = None, ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]: r""" @@ -1783,9 +1752,8 @@ def forward( ... outputs = model(**encoding) ... 
scores[text] = outputs.logits[0, 1].item() ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.bridgetower( + outputs: BridgeTowerModelOutput = self.bridgetower( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1796,11 +1764,9 @@ def forward( image_embeds=image_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooler_output = outputs.pooler_output if return_dict else outputs[2] - + pooler_output = outputs.pooler_output logits = self.itm_score(pooler_output) itm_loss = None @@ -1810,10 +1776,6 @@ def forward( labels = labels.to(logits.device) itm_loss = loss_fct(logits, labels) - if not return_dict: - output = tuple(logits) - return ((itm_loss,) + output) if itm_loss is not None else output - return SequenceClassifierOutput( loss=itm_loss, logits=logits, @@ -1852,6 +1814,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BridgeTowerContrastiveOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1866,7 +1829,6 @@ def forward( image_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, return_loss: Optional[bool] = None, ) -> Union[BridgeTowerContrastiveOutput, Tuple[torch.FloatTensor]]: r""" @@ -1904,9 +1866,8 @@ def forward( >>> print("Loss with swapped images", round(loss_swapped.item(), 4)) Loss with swapped images 2.126 ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.bridgetower( + outputs: BridgeTowerModelOutput = self.bridgetower( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1917,13 +1878,9 @@ def forward( image_embeds=image_embeds, output_attentions=output_attentions, output_hidden_states=True, - return_dict=return_dict, - ) - - pooler_output = outputs.pooler_output if return_dict else outputs[2] - hidden_states_txt, hidden_states_img, hidden_states_cross_modal = ( - outputs.hidden_states if return_dict else outputs[3] ) + pooler_output = outputs.pooler_output + hidden_states_txt, hidden_states_img, hidden_states_cross_modal = outputs.hidden_states text_embeds = hidden_states_txt[-1] image_embeds = hidden_states_img[-1] @@ -1960,10 +1917,6 @@ def forward( image_to_cross_loss = nn.functional.cross_entropy(logits_image_to_cross, labels) itc_loss = (text_to_image_loss + text_to_cross_loss + image_to_cross_loss) / 3.0 - if not return_dict: - output = (logits, text_embeds, image_embeds, cross_embeds) + outputs[3:] - return ((itc_loss,) + output) if itc_loss is not None else output - return BridgeTowerContrastiveOutput( loss=itc_loss, logits=logits, From 5b9a99e139a94caa4c68afbc1cf924bfc0291a8a Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 14:05:36 +0000 Subject: [PATCH 11/32] camembert --- .../models/camembert/modeling_camembert.py | 121 +++++------------- 1 file changed, 31 insertions(+), 90 deletions(-) diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index b69590ae21a5..0678a3cf7c26 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -16,7 +16,7 @@ """PyTorch CamemBERT model.""" 
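A quick way to sanity-check the models touched in this series is to confirm that tuple-style callers behave as before once `return_dict` handling moves into the decorator. The snippet below is an illustrative smoke test only; it assumes the public `camembert-base` checkpoint is available and that the decorated forward still honors `return_dict=False`.

```python
# Illustrative smoke test, not part of the patch; assumes network access to the
# public checkpoint and that `return_dict=False` is still accepted.
import torch
from transformers import AutoTokenizer, CamembertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("camembert-base")
model = CamembertForMaskedLM.from_pretrained("camembert-base")  # loaded in eval mode

inputs = tokenizer("J'aime le <mask> !", return_tensors="pt")
with torch.no_grad():
    as_output = model(**inputs)                    # MaskedLMOutput dataclass
    as_tuple = model(**inputs, return_dict=False)  # plain tuple via the decorator

assert torch.allclose(as_output.logits, as_tuple[0])
```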
import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -46,6 +46,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, get_torch_version, logging, replace_return_docstrings, @@ -602,6 +603,7 @@ def __init__(self, config): self.layer = nn.ModuleList([CamembertLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -613,8 +615,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -667,18 +668,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -913,8 +902,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -939,7 +927,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1034,7 +1021,7 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1044,14 +1031,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1091,6 +1074,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1113,8 +1097,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1123,9 +1106,7 @@ def forward( kwargs (`Dict[str, any]`, *optional*, defaults to `{}`): Used to hide legacy arguments that have been deprecated. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1136,9 +1117,8 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) masked_lm_loss = None @@ -1148,10 +1128,6 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1180,6 +1156,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="cardiffnlp/twitter-roberta-base-emotion", @@ -1199,17 +1176,14 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1218,9 +1192,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.classifier(sequence_output) loss = None @@ -1248,10 +1221,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, @@ -1279,6 +1248,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @@ -1298,15 +1268,13 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1319,7 +1287,7 @@ def forward( else None ) - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, @@ -1328,9 +1296,8 @@ def forward( inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1343,10 +1310,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1378,6 +1341,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="Jean-Baptiste/roberta-large-ner-english", @@ -1397,15 +1361,12 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1414,11 +1375,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1429,10 +1388,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1460,6 +1415,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="deepset/roberta-base-squad2", @@ -1480,8 +1436,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1492,9 +1447,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1503,10 +1456,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -1530,10 +1481,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -1568,6 +1515,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -1585,9 +1533,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1631,11 +1578,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1648,10 +1594,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) lm_loss = None @@ -1665,10 +1610,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, From a926c5b53cf39cc23335ee6f324f5d0c4618006d Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 14:10:31 +0000 Subject: [PATCH 12/32] data2vec_text --- .../models/data2vec/modeling_data2vec_text.py | 120 +++++------------- 1 file changed, 32 insertions(+), 88 deletions(-) diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 5f84eca754e8..321d56f97057 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -15,7 +15,7 @@ """PyTorch Data2VecText model.""" import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -40,6 +40,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -479,6 +480,7 @@ def __init__(self, config): self.layer = nn.ModuleList([Data2VecTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -490,8 +492,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -544,18 +545,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -726,6 +715,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -747,8 +737,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: 
Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -773,7 +762,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -836,7 +824,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -846,14 +834,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -888,6 +872,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -905,9 +890,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -951,11 +935,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.data2vec_text( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -968,10 +951,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) lm_loss = None @@ -983,10 +965,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1030,6 +1008,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1050,8 +1029,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1060,9 +1038,7 @@ def forward( kwargs (`Dict[str, any]`, *optional*, defaults to *{}*): Used to hide legacy arguments that have been deprecated. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.data2vec_text( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1073,9 +1049,8 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) masked_lm_loss = None @@ -1085,10 +1060,6 @@ def forward( labels = labels.to(prediction_scores.device) masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1148,6 +1119,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1165,17 +1137,14 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.data2vec_text( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1184,9 +1153,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.classifier(sequence_output) loss = None @@ -1214,10 +1182,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, @@ -1244,6 +1208,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @@ -1263,15 +1228,13 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1284,7 +1247,7 @@ def forward( else None ) - outputs = self.data2vec_text( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.data2vec_text( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, @@ -1293,9 +1256,8 @@ def forward( inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1308,10 +1270,6 @@ def forward( labels = labels.to(reshaped_logits.device) loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1342,6 +1300,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1359,15 +1318,12 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.data2vec_text( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1376,10 +1332,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1391,10 +1346,6 @@ def forward( labels = labels.to(logits.device) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1444,6 +1395,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1462,8 +1414,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1474,9 +1425,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.data2vec_text( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1485,10 +1434,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -1512,10 +1460,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, From d7447fd037e5909f7584ed60f7c1f9ca580f6be8 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 14:23:00 +0000 Subject: [PATCH 13/32] electra (breaks fx) --- .../models/electra/modeling_electra.py | 172 ++++++------------ 1 file changed, 53 insertions(+), 119 deletions(-) diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 7b73f022122d..c5a48b8c945e 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -17,7 +17,7 @@ import math import os from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -43,6 +43,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -536,6 +537,7 @@ def __init__(self, config): self.layer = nn.ModuleList([ElectraLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -547,8 +549,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -601,18 +602,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -827,6 +816,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -847,13 +837,11 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, 
output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -907,7 +895,7 @@ def forward( if hasattr(self, "embeddings_project"): hidden_states = self.embeddings_project(hidden_states) - hidden_states = self.encoder( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( hidden_states, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -917,10 +905,8 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - - return hidden_states + return outputs class ElectraClassificationHead(nn.Module): @@ -964,6 +950,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="bhadresh-savani/electra-base-emotion", @@ -983,17 +970,15 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - discriminator_hidden_states = self.electra( + discriminator_outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1002,10 +987,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = discriminator_hidden_states[0] + sequence_output = discriminator_outputs.last_hidden_state logits = self.classifier(sequence_output) loss = None @@ -1031,15 +1015,11 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + discriminator_hidden_states[1:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, - hidden_states=discriminator_hidden_states.hidden_states, - attentions=discriminator_hidden_states.attentions, + hidden_states=discriminator_outputs.hidden_states, + attentions=discriminator_outputs.attentions, ) @@ -1060,6 +1040,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=ElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1073,8 +1054,7 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], ElectraForPreTrainingOutput]: + ) -> ElectraForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the ELECTRA loss. 
Input should be a sequence of tokens (see `input_ids` docstring) @@ -1108,9 +1088,7 @@ def forward( >>> predictions.squeeze().tolist() [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - discriminator_hidden_states = self.electra( + discriminator_outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1119,32 +1097,26 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - discriminator_sequence_output = discriminator_hidden_states[0] - - logits = self.discriminator_predictions(discriminator_sequence_output) + sequence_output = discriminator_outputs.last_hidden_state + logits = self.discriminator_predictions(sequence_output) loss = None if labels is not None: loss_fct = nn.BCEWithLogitsLoss() if attention_mask is not None: - active_loss = attention_mask.view(-1, discriminator_sequence_output.shape[1]) == 1 - active_logits = logits.view(-1, discriminator_sequence_output.shape[1])[active_loss] + active_loss = attention_mask.view(-1, sequence_output.shape[1]) == 1 + active_logits = logits.view(-1, sequence_output.shape[1])[active_loss] active_labels = labels[active_loss] loss = loss_fct(active_logits, active_labels.float()) else: - loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float()) - - if not return_dict: - output = (logits,) + discriminator_hidden_states[1:] - return ((loss,) + output) if loss is not None else output + loss = loss_fct(logits.view(-1, sequence_output.shape[1]), labels.float()) return ElectraForPreTrainingOutput( loss=loss, logits=logits, - hidden_states=discriminator_hidden_states.hidden_states, - attentions=discriminator_hidden_states.attentions, + hidden_states=discriminator_outputs.hidden_states, + attentions=discriminator_outputs.attentions, ) @@ -1176,6 +1148,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, word_embeddings): self.generator_lm_head = word_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="google/electra-small-generator", @@ -1196,17 +1169,14 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - generator_hidden_states = self.electra( + generator_outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1215,11 +1185,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - generator_sequence_output = generator_hidden_states[0] + sequence_output = generator_outputs.last_hidden_state - prediction_scores = self.generator_predictions(generator_sequence_output) + prediction_scores = self.generator_predictions(sequence_output) prediction_scores = self.generator_lm_head(prediction_scores) loss = None @@ -1228,15 +1197,11 @@ def forward( loss_fct = nn.CrossEntropyLoss() # -100 index = padding token loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + generator_hidden_states[1:] - return ((loss,) + output) if loss is not None else output - return MaskedLMOutput( loss=loss, logits=prediction_scores, - hidden_states=generator_hidden_states.hidden_states, - attentions=generator_hidden_states.attentions, + hidden_states=generator_outputs.hidden_states, + attentions=generator_outputs.attentions, ) @@ -1262,6 +1227,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="bhadresh-savani/electra-base-discriminator-finetuned-conll03-english", @@ -1281,15 +1247,12 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - discriminator_hidden_states = self.electra( + discriminator_outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1298,10 +1261,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - discriminator_sequence_output = discriminator_hidden_states[0] - + discriminator_sequence_output = discriminator_outputs.last_hidden_state discriminator_sequence_output = self.dropout(discriminator_sequence_output) logits = self.classifier(discriminator_sequence_output) @@ -1310,15 +1271,11 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + discriminator_hidden_states[1:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, - hidden_states=discriminator_hidden_states.hidden_states, - attentions=discriminator_hidden_states.attentions, + hidden_states=discriminator_outputs.hidden_states, + attentions=discriminator_outputs.attentions, ) @@ -1343,6 +1300,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="bhadresh-savani/electra-base-squad2", @@ -1365,8 +1323,7 @@ def forward( end_positions: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1377,9 +1334,8 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - discriminator_hidden_states = self.electra( + discriminator_outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1390,8 +1346,7 @@ def forward( output_hidden_states=output_hidden_states, ) - sequence_output = discriminator_hidden_states[0] - + sequence_output = discriminator_outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1414,19 +1369,12 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = ( - start_logits, - end_logits, - ) + discriminator_hidden_states[1:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, - hidden_states=discriminator_hidden_states.hidden_states, - attentions=discriminator_hidden_states.attentions, + hidden_states=discriminator_outputs.hidden_states, + attentions=discriminator_outputs.attentions, ) @@ -1448,6 +1396,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1465,15 +1414,13 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1486,7 +1433,7 @@ def forward( else None ) - discriminator_hidden_states = self.electra( + discriminator_outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1495,11 +1442,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = discriminator_hidden_states[0] - + sequence_output = discriminator_outputs.last_hidden_state pooled_output = self.sequence_summary(sequence_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) @@ -1509,15 +1454,11 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + discriminator_hidden_states[1:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, - hidden_states=discriminator_hidden_states.hidden_states, - attentions=discriminator_hidden_states.attentions, + hidden_states=discriminator_outputs.hidden_states, + attentions=discriminator_outputs.attentions, ) @@ -1545,6 +1486,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.generator_lm_head = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -1562,9 +1504,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1608,11 +1549,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.electra( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1625,10 +1565,8 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.generator_lm_head(self.generator_predictions(sequence_output)) lm_loss = None @@ -1640,10 +1578,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[1:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, From dc845e69906abece65cce6e3b81219c130641936 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 14:29:52 +0000 Subject: [PATCH 14/32] ernie --- .../models/ernie/modeling_ernie.py | 147 +++++------------- 1 file changed, 40 insertions(+), 107 deletions(-) diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 559078b1ef1a..b69482391a46 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -17,7 +17,7 @@ import math import warnings from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -44,6 +44,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -465,6 +466,7 @@ def __init__(self, config): self.layer = nn.ModuleList([ErnieLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -476,8 +478,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -530,18 +531,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -824,6 +813,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -845,8 +835,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, 
output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -871,7 +860,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -935,7 +923,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -945,14 +933,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -992,6 +976,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=ErnieForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1007,8 +992,7 @@ def forward( next_sentence_label: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], ErnieForPreTrainingOutput]: + ) -> ErnieForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., @@ -1041,9 +1025,8 @@ def forward( >>> seq_relationship_logits = outputs.seq_relationship_logits ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1053,10 +1036,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output, pooled_output = outputs[:2] + sequence_output = outputs.last_hidden_state + pooled_output = outputs.pooler_output prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) total_loss = None @@ -1066,10 +1049,6 @@ def forward( next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return ErnieForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, @@ -1107,6 +1086,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1129,9 +1109,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1156,11 +1135,10 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1174,10 +1152,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) lm_loss = None @@ -1189,10 +1166,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1241,6 +1214,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1263,8 +1237,7 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1272,9 +1245,7 @@ def forward( loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1286,10 +1257,9 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) masked_lm_loss = None @@ -1297,10 +1267,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1349,6 +1315,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1363,9 +1330,8 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]: + ) -> NextSentencePredictorOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the next 
sequence prediction (classification) loss. Input should be a sequence pair @@ -1403,9 +1369,7 @@ def forward( ) labels = kwargs.pop("next_sentence_label") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1415,11 +1379,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output seq_relationship_scores = self.cls(pooled_output) next_sentence_loss = None @@ -1427,10 +1389,6 @@ def forward( loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) - if not return_dict: - output = (seq_relationship_scores,) + outputs[2:] - return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output - return NextSentencePredictorOutput( loss=next_sentence_loss, logits=seq_relationship_scores, @@ -1463,6 +1421,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, @@ -1476,17 +1435,15 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1496,11 +1453,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1526,9 +1481,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, @@ -1560,6 +1512,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1578,15 +1531,13 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1599,7 +1550,7 @@ def forward( else None ) - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1609,10 +1560,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1623,10 +1573,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1658,6 +1604,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, @@ -1671,15 +1618,13 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for 
computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1689,7 +1634,6 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) sequence_output = outputs[0] @@ -1702,10 +1646,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1733,6 +1673,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, @@ -1747,8 +1688,7 @@ def forward( end_positions: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1759,9 +1699,8 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1771,11 +1710,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1798,10 +1735,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, From 07fe220cd42148392a57aea2133707e923b2e2ab Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 14:34:48 +0000 Subject: [PATCH 15/32] layoutlm --- .../models/layoutlm/modeling_layoutlm.py | 98 ++++++------------- 1 file changed, 30 insertions(+), 68 deletions(-) diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 8c31521a3f6d..5d48981925c2 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -15,7 +15,7 @@ """PyTorch LayoutLM model.""" import math -from typing import Optional, Tuple, Union +from typing import Optional, Tuple import torch import torch.utils.checkpoint @@ -33,7 +33,13 @@ ) from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + can_return_tuple, + logging, + replace_return_docstrings, +) from .configuration_layoutlm import LayoutLMConfig @@ -455,6 +461,7 @@ def __init__(self, config): self.layer = nn.ModuleList([LayoutLMLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -466,8 +473,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -520,18 +526,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -731,6 +725,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): 
self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -746,8 +741,7 @@ def forward( encoder_attention_mask: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" Returns: @@ -786,7 +780,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -830,20 +823,16 @@ def forward( token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, extended_attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -876,6 +865,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -892,8 +882,7 @@ def forward( encoder_attention_mask: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., @@ -939,9 +928,8 @@ def forward( >>> loss = outputs.loss ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.layoutlm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.layoutlm( input_ids, bbox, attention_mask=attention_mask, @@ -953,10 +941,9 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) masked_lm_loss = None @@ -967,10 +954,6 @@ def forward( labels.view(-1), ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1000,6 +983,7 @@ def __init__(self, config): def get_input_embeddings(self): return self.layoutlm.embeddings.word_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1014,8 +998,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -1061,9 +1044,8 @@ def forward( >>> loss = outputs.loss >>> logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.layoutlm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.layoutlm( input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, @@ -1073,11 +1055,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1103,9 +1083,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, @@ -1137,6 +1114,7 @@ def __init__(self, config): def get_input_embeddings(self): return self.layoutlm.embeddings.word_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1151,8 +1129,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. 
Indices should be in `[0, ..., config.num_labels - 1]`. @@ -1196,9 +1173,8 @@ def forward( >>> loss = outputs.loss >>> logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.layoutlm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.layoutlm( input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, @@ -1208,11 +1184,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1221,10 +1195,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1255,6 +1225,7 @@ def __init__(self, config, has_visual_segment_embedding=True): def get_input_embeddings(self): return self.layoutlm.embeddings.word_embeddings + @can_return_tuple @replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -1269,8 +1240,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1325,9 +1295,7 @@ def forward( M. Hamann P. Harper, P. 
Martinez ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.layoutlm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.layoutlm( input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, @@ -1337,11 +1305,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1364,10 +1330,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, From 767cc17d019bc5e9a7d8a840f4fee228a0eae8a2 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 14:41:47 +0000 Subject: [PATCH 16/32] markuplm --- .../models/markuplm/modeling_markuplm.py | 73 +++++-------------- 1 file changed, 19 insertions(+), 54 deletions(-) diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index f47483d9d861..137704a33d44 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -43,7 +43,7 @@ find_pruneable_heads_and_indices, prune_linear_layer, ) -from ...utils import logging +from ...utils import can_return_tuple, logging from .configuration_markuplm import MarkupLMConfig @@ -620,6 +620,7 @@ def __init__(self, config): self.layer = nn.ModuleList([MarkupLMLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -631,8 +632,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -685,18 +685,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -833,6 +821,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -847,8 +836,7 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - 
return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" Returns: @@ -873,7 +861,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -915,21 +902,17 @@ def forward( token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, extended_attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -967,6 +950,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -983,8 +967,7 @@ def forward( end_positions: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. 
@@ -1021,9 +1004,8 @@ def forward( >>> processor.decode(predict_answer_tokens).strip() 'Niels' ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.markuplm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.markuplm( input_ids, xpath_tags_seq=xpath_tags_seq, xpath_subs_seq=xpath_subs_seq, @@ -1034,10 +1016,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -1061,10 +1042,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -1091,6 +1068,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1106,8 +1084,7 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
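The same mechanical conversion repeats in every hunk of this series: the `return_dict` parameter and its `self.config.use_return_dict` fallback are dropped from each `forward`, the `if not return_dict:` tuple-building blocks are removed, the new `@can_return_tuple` decorator takes over the dict-vs-tuple handling, and submodule outputs are read by attribute (`outputs.last_hidden_state`, `outputs.pooler_output`) instead of by index. A minimal sketch of a decorator in that spirit is shown below; it is an illustration of the pattern only, not the actual `transformers.utils.can_return_tuple` implementation, and the exact kwarg/config precedence is an assumption.

```python
# Illustrative sketch only -- not the real transformers.utils.can_return_tuple.
# Assumption: the wrapped forward always returns a ModelOutput dataclass, and either a
# `return_dict` kwarg or `config.use_return_dict` decides whether to convert it to a tuple.
import functools


def can_return_tuple(forward):
    @functools.wraps(forward)
    def wrapper(self, *args, **kwargs):
        return_dict = kwargs.pop("return_dict", None)
        if return_dict is None:
            return_dict = getattr(self.config, "use_return_dict", True)
        output = forward(self, *args, **kwargs)  # always a ModelOutput here
        return output if return_dict else output.to_tuple()

    return wrapper
```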
@@ -1135,9 +1112,8 @@ def forward( >>> loss = outputs.loss >>> logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.markuplm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.markuplm( input_ids, xpath_tags_seq=xpath_tags_seq, xpath_subs_seq=xpath_subs_seq, @@ -1148,10 +1124,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.classifier(sequence_output) # (batch_size, seq_length, node_type_size) loss = None @@ -1162,10 +1137,6 @@ def forward( labels.view(-1), ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=prediction_scores, @@ -1198,6 +1169,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1213,8 +1185,7 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -1241,9 +1212,8 @@ def forward( >>> loss = outputs.loss >>> logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.markuplm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.markuplm( input_ids, xpath_tags_seq=xpath_tags_seq, xpath_subs_seq=xpath_subs_seq, @@ -1254,11 +1224,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1284,9 +1252,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, From 5875ad2a5bcd31e5fad018a03d07f782baae8fee Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 14:46:04 +0000 Subject: [PATCH 17/32] mobilebert (breaks fx) --- .../models/mobilebert/modeling_mobilebert.py | 128 +++++------------- 1 file changed, 37 insertions(+), 91 deletions(-) diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index b1d269055e65..ff0f5f43797c 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -24,7 +24,7 @@ import os import warnings from dataclasses import dataclass -from typing import Optional, Tuple, Union +from typing import Optional, Tuple import torch from torch 
import nn @@ -48,6 +48,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -562,6 +563,7 @@ def __init__(self, config): super().__init__() self.layer = nn.ModuleList([MobileBertLayer(config) for _ in range(config.num_hidden_layers)]) + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -569,8 +571,7 @@ def forward( head_mask: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None for i, layer_module in enumerate(self.layer): @@ -592,8 +593,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) @@ -842,6 +841,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -858,13 +858,11 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -897,20 +895,16 @@ def forward( embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -952,6 +946,7 @@ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Em return super().resize_token_embeddings(new_num_tokens=new_num_tokens) + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MobileBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -966,8 +961,7 @@ def forward( next_sentence_label: 
Optional[torch.LongTensor] = None, output_attentions: Optional[torch.FloatTensor] = None, output_hidden_states: Optional[torch.FloatTensor] = None, - return_dict: Optional[torch.FloatTensor] = None, - ) -> Union[Tuple, MobileBertForPreTrainingOutput]: + ) -> MobileBertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -998,9 +992,8 @@ def forward( >>> prediction_logits = outputs.prediction_logits >>> seq_relationship_logits = outputs.seq_relationship_logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1009,9 +1002,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output, pooled_output = outputs[:2] + sequence_output = outputs.last_hidden_state + pooled_output = outputs.pooler_output prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) total_loss = None @@ -1021,10 +1014,6 @@ def forward( next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return MobileBertForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, @@ -1061,6 +1050,7 @@ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Em ) return super().resize_token_embeddings(new_num_tokens=new_num_tokens) + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1080,17 +1070,15 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1099,10 +1087,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) masked_lm_loss = None @@ -1110,10 +1097,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1146,6 +1129,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1159,9 +1143,8 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple, NextSentencePredictorOutput]: + ) -> NextSentencePredictorOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the next sequence prediction (classification) loss. 
Input should be a sequence pair @@ -1198,9 +1181,7 @@ def forward( ) labels = kwargs.pop("next_sentence_label") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1209,10 +1190,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output seq_relationship_score = self.cls(pooled_output) next_sentence_loss = None @@ -1220,10 +1200,6 @@ def forward( loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), labels.view(-1)) - if not return_dict: - output = (seq_relationship_score,) + outputs[2:] - return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output - return NextSentencePredictorOutput( loss=next_sentence_loss, logits=seq_relationship_score, @@ -1256,6 +1232,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, @@ -1275,17 +1252,14 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.mobilebert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1294,11 +1268,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1324,9 +1296,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, @@ -1355,6 +1324,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_QA, @@ -1377,8 +1347,7 @@ def forward( end_positions: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1389,9 +1358,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.mobilebert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1400,11 +1367,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1427,10 +1392,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -1462,6 +1423,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @@ -1481,15 +1443,13 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1502,7 +1462,7 @@ def forward( else None ) - outputs = self.mobilebert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1511,11 +1471,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) @@ -1525,10 +1483,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1560,6 +1514,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, @@ -1579,15 +1534,12 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.mobilebert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1596,11 +1548,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1609,10 +1559,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, From 893799322ec5bbb0d152ae0503ad6c32b0dc00ee Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 16:22:36 +0000 Subject: [PATCH 18/32] roberta (breaks fx) --- .../models/roberta/modeling_roberta.py | 122 +++++------------- 1 file changed, 32 insertions(+), 90 deletions(-) diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index f2dfa19a6a50..1ee772d9c84d 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -16,7 +16,7 @@ """PyTorch RoBERTa model.""" import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -46,6 +46,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, get_torch_version, logging, replace_return_docstrings, @@ -584,6 +585,7 @@ def __init__(self, config): self.layer = nn.ModuleList([RobertaLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -595,8 +597,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -649,18 +650,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -834,6 +823,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -854,8 +844,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: 
Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -880,7 +869,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -975,7 +963,7 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -985,14 +973,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1027,6 +1011,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -1044,9 +1029,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1090,11 +1074,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1107,10 +1090,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) lm_loss = None @@ -1124,10 +1106,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1171,6 +1149,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1193,8 +1172,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1203,9 +1181,7 @@ def forward( kwargs (`Dict[str, any]`, *optional*, defaults to `{}`): Used to hide legacy arguments that have been deprecated. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1216,9 +1192,8 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) masked_lm_loss = None @@ -1228,10 +1203,6 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1290,6 +1261,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="cardiffnlp/twitter-roberta-base-emotion", @@ -1309,17 +1281,14 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1328,9 +1297,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.classifier(sequence_output) loss = None @@ -1358,10 +1326,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, @@ -1388,6 +1352,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1405,15 +1370,13 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1426,7 +1389,7 @@ def forward( else None ) - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, @@ -1435,9 +1398,8 @@ def forward( inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1450,10 +1412,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1484,6 +1442,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="Jean-Baptiste/roberta-large-ner-english", @@ -1503,15 +1462,12 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1520,11 +1476,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1535,10 +1489,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1587,6 +1537,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="deepset/roberta-base-squad2", @@ -1607,8 +1558,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1619,9 +1569,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1630,11 +1578,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1657,10 +1603,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, From 27f7e188d49702e95d4c198aeaa4d85e7e0ffe83 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 16:34:25 +0000 Subject: [PATCH 19/32] clipseg --- .../models/clipseg/modeling_clipseg.py | 148 +++++------------- 1 file changed, 39 insertions(+), 109 deletions(-) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index a24847471f72..0b060d64c024 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -31,6 +31,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -604,6 +605,7 @@ def __init__(self, config: CLIPSegConfig): self.layers = nn.ModuleList([CLIPSegEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -611,8 +613,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -646,7 +647,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -679,8 +679,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -698,6 +696,7 @@ def __init__(self, config: CLIPSegTextConfig): # For `pooled_output` computation self.eos_token_id = config.eos_token_id + @can_return_tuple @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig) # Adapted from transformers.models.clip.modeling_clip.CLIPTextTransformer.forward with clip->clipseg, CLIP->CLIPSeg @@ -708,8 +707,7 @@ def forward( position_ids: Optional[torch.Tensor] = 
None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -718,7 +716,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is None: raise ValueError("You have to specify input_ids") @@ -738,16 +735,15 @@ def forward( # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = self.final_layer_norm(last_hidden_state) if self.eos_token_id == 2: @@ -772,9 +768,6 @@ def forward( .argmax(dim=-1), ] - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -800,6 +793,7 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value + @can_return_tuple @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig) def forward( @@ -809,8 +803,7 @@ def forward( position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -834,7 +827,6 @@ def forward( position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) @@ -850,6 +842,7 @@ def __init__(self, config: CLIPSegVisionConfig): self.encoder = CLIPSegEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig) def forward( @@ -857,7 +850,6 @@ def forward( pixel_values: Optional[torch.FloatTensor], output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, interpolate_pos_encoding: Optional[bool] = True, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -868,25 +860,20 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - 
return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -908,6 +895,7 @@ def __init__(self, config: CLIPSegVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @can_return_tuple @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig) def forward( @@ -916,8 +904,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: Optional[bool] = True, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -945,7 +932,6 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) @@ -991,10 +977,7 @@ def get_text_features( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Returns: text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by @@ -1011,36 +994,23 @@ def get_text_features( >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) ```""" - # Use CLIPSEG model's config for some fields (if specified) instead of those of vision & text components. 
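[Editorial aside, not part of the patch] The `return_dict` plumbing being deleted in this hunk, and in every other model touched by this series, is taken over by the `@can_return_tuple` decorator. Below is a minimal sketch of the behaviour the callers rely on; the real implementation lives in `transformers.utils` and differs in detail (it also has to handle positional arguments and config defaults), so the wrapper name and body here are illustrative only.

```python
# Illustrative sketch only -- not the actual transformers.utils implementation.
import functools

def can_return_tuple_sketch(forward):
    """Let forward() always build a ModelOutput, converting to a tuple on request."""

    @functools.wraps(forward)
    def wrapper(self, *args, return_dict=None, **kwargs):
        output = forward(self, *args, **kwargs)  # the body no longer sees return_dict
        if return_dict is None:
            return_dict = self.config.use_return_dict
        return output if return_dict else output.to_tuple()

    return wrapper
```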
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, ) - - pooled_output = text_outputs[1] + pooled_output = text_outputs.pooler_output text_features = self.text_projection(pooled_output) - return text_features @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING) def get_image_features( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = True, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Returns: image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by @@ -1063,26 +1033,17 @@ def get_image_features( >>> image_features = model.get_image_features(**inputs) ```""" - # Use CLIPSEG model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - - pooled_output = vision_outputs[1] # pooled_output + pooled_output = vision_outputs.pooler_output image_features = self.visual_projection(pooled_output) - return image_features + @can_return_tuple @add_start_docstrings_to_model_forward(CLIPSEG_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CLIPSegOutput, config_class=CLIPSegConfig) def forward( @@ -1095,8 +1056,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = True, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CLIPSegOutput]: + ) -> CLIPSegOutput: r""" Returns: @@ -1126,29 +1086,26 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, 
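[Editorial aside, not part of the patch] The substitutions of `outputs[0]` / `outputs[1]` with `.last_hidden_state` / `.pooler_output` in the surrounding hunks are behaviour-preserving because `ModelOutput` indexes over its non-None fields, so positional and attribute access hit the same objects as long as the pooled output is populated. A self-contained check with dummy tensors (shapes are placeholders):

```python
import torch
from transformers.modeling_outputs import BaseModelOutputWithPooling

out = BaseModelOutputWithPooling(
    last_hidden_state=torch.zeros(1, 4, 8),
    pooler_output=torch.zeros(1, 8),
)
assert out[0] is out.last_hidden_state
assert out[1] is out.pooler_output  # why `vision_outputs[1]` -> `.pooler_output` is safe
```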
output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - image_embeds = vision_outputs[1] + image_embeds = vision_outputs.pooler_output image_embeds = self.visual_projection(image_embeds) - text_embeds = text_outputs[1] + text_embeds = text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) # normalized features @@ -1164,10 +1121,6 @@ def forward( if return_loss: loss = clipseg_loss(logits_per_text) - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - return CLIPSegOutput( loss=loss, logits_per_image=logits_per_image, @@ -1279,13 +1232,13 @@ def __init__(self, config: CLIPSegConfig): decoder_config.hidden_act = "relu" self.layers = nn.ModuleList([CLIPSegDecoderLayer(decoder_config) for _ in range(len(config.extract_layers))]) + @can_return_tuple def forward( self, hidden_states: Tuple[torch.Tensor], conditional_embeddings: torch.Tensor, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = True, ): all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -1326,9 +1279,6 @@ def forward( logits = self.transposed_convolution(output).squeeze(1) - if not return_dict: - return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None) - return CLIPSegDecoderOutput( logits=logits, hidden_states=all_hidden_states, @@ -1361,10 +1311,10 @@ def __init__(self, config: CLIPSegConfig): def get_conditional_embeddings( self, batch_size: Optional[int] = None, - input_ids: Optional[torch.Tensor] = None, + input_ids: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - conditional_pixel_values: Optional[torch.Tensor] = None, + conditional_pixel_values: Optional[torch.FloatTensor] = None, ): if input_ids is not None: # compute conditional embeddings from texts @@ -1387,6 +1337,7 @@ def get_conditional_embeddings( return conditional_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(CLIPSEG_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CLIPSegImageSegmentationOutput, config_class=CLIPSegTextConfig) def forward( @@ -1401,8 +1352,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = True, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CLIPSegOutput]: + ) -> CLIPSegImageSegmentationOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. 
Indices should be in `[0, ..., @@ -1432,36 +1382,21 @@ def forward( >>> print(logits.shape) torch.Size([3, 352, 352]) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the query images through the frozen CLIP vision encoder with torch.no_grad(): - vision_outputs = self.clip.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.clip.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=True, # we need the intermediate hidden states interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) pooled_output = self.clip.visual_projection(vision_outputs[1]) - hidden_states = vision_outputs.hidden_states if return_dict else vision_outputs[2] + hidden_states = vision_outputs.hidden_states # we add +1 here as the hidden states also include the initial embeddings activations = [hidden_states[i + 1] for i in self.extract_layers] - # update vision_outputs - if return_dict: - vision_outputs = BaseModelOutputWithPooling( - last_hidden_state=vision_outputs.last_hidden_state, - pooler_output=vision_outputs.pooler_output, - hidden_states=vision_outputs.hidden_states if output_hidden_states else None, - attentions=vision_outputs.attentions, - ) - else: - vision_outputs = ( - vision_outputs[:2] + vision_outputs[3:] if not output_hidden_states else vision_outputs - ) - # step 2: compute conditional embeddings, either from text, images or an own provided embedding if conditional_embeddings is None: conditional_embeddings = self.get_conditional_embeddings( @@ -1483,14 +1418,13 @@ def forward( ) # step 3: forward both the pooled output and the activations through the lightweight decoder to predict masks - decoder_outputs = self.decoder( + decoder_outputs: CLIPSegDecoderOutput = self.decoder( activations, conditional_embeddings, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - logits = decoder_outputs.logits if return_dict else decoder_outputs[0] + logits = decoder_outputs.logits loss = None if labels is not None: @@ -1499,10 +1433,6 @@ def forward( loss_fn = nn.BCEWithLogitsLoss() loss = loss_fn(logits, labels) - if not return_dict: - output = (logits, conditional_embeddings, pooled_output, vision_outputs, decoder_outputs) - return ((loss,) + output) if loss is not None else output - return CLIPSegImageSegmentationOutput( loss=loss, logits=logits, From 34f1435020680b5b30381731b585b00310edc052 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 16:38:51 +0000 Subject: [PATCH 20/32] git --- src/transformers/models/git/modeling_git.py | 70 ++++++--------------- 1 file changed, 19 insertions(+), 51 deletions(-) diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 7efdf2d45c48..c61529bc70b9 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -39,6 +39,7 @@ from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -404,6 +405,7 @@ def __init__(self, config): self.layer = nn.ModuleList([GitLayer(config, i) for i in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -414,8 +416,7 @@ def forward( output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, pixel_values_present: Optional[bool] 
= False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPast]: + ) -> BaseModelOutputWithPast: if self.gradient_checkpointing and self.training: if use_cache: logger.warning_once( @@ -478,17 +479,6 @@ def forward( if return_legacy_cache: next_cache = next_cache.to_legacy_cache() - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_cache, - all_hidden_states, - all_self_attentions, - ] - if v is not None - ) return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=next_cache, @@ -886,6 +876,7 @@ def __init__(self, config: GitVisionConfig): self.layers = nn.ModuleList([GitVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -893,8 +884,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -928,7 +918,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -961,8 +950,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -998,6 +985,7 @@ def __init__(self, config: GitVisionConfig): self.encoder = GitVisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple @add_start_docstrings_to_model_forward(GIT_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutput, config_class=GitVisionConfig) def forward( @@ -1006,8 +994,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: Optional[bool] = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Returns: @@ -1016,7 +1003,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1024,20 +1010,15 @@ def forward( hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPast = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] - + last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = self.post_layernorm(last_hidden_state) - if not return_dict: - return (last_hidden_state,) + encoder_outputs[1:] - return BaseModelOutput( 
last_hidden_state=last_hidden_state, hidden_states=encoder_outputs.hidden_states, @@ -1063,6 +1044,7 @@ def __init__(self, config: GitVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @can_return_tuple @add_start_docstrings_to_model_forward(GIT_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutput, config_class=GitVisionConfig) def forward( @@ -1071,8 +1053,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Returns: @@ -1094,14 +1075,12 @@ def forward( >>> outputs = model(**inputs) >>> last_hidden_state = outputs.last_hidden_state ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict return self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) @@ -1213,6 +1192,7 @@ def create_attention_mask(self, tgt, memory, tgt_mask, past_key_values_length, m return full_attention_mask + @can_return_tuple @add_start_docstrings_to_model_forward(GIT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) def forward( @@ -1228,8 +1208,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPast: r""" use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see @@ -1262,7 +1241,6 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -1363,7 +1341,7 @@ def forward( else: combined_attention_mask[:, :, -input_shape[1] :, -input_shape[1] :] += expanded_attn_mask - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPast = self.encoder( hidden_states, attention_mask=combined_attention_mask, head_mask=head_mask, @@ -1371,13 +1349,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, pixel_values_present=pixel_values is not None, ) - sequence_output = encoder_outputs[0] - - if not return_dict: - return (sequence_output,) + encoder_outputs[1:] + sequence_output = encoder_outputs.last_hidden_state return BaseModelOutputWithPast( last_hidden_state=sequence_output, @@ -1408,6 +1382,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.output = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(GIT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( @@ -1424,9 +1399,8 @@ def forward( output_attentions: Optional[bool] = None, 
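[Editorial aside, not part of the patch] With `@can_return_tuple` in place, the typed `CausalLMOutputWithPast` becomes the default return of `GitForCausalLM.forward`, and the legacy tuple is derived from it via `ModelOutput.to_tuple()` when a caller passes `return_dict=False`. The snippet below only exercises the output class with dummy data and does not load a checkpoint:

```python
import torch
from transformers.modeling_outputs import CausalLMOutputWithPast

out = CausalLMOutputWithPast(logits=torch.zeros(1, 5, 100))  # dummy logits, no loss/cache
legacy = out.to_tuple()  # what the decorator hands back for return_dict=False
assert len(legacy) == 1 and legacy[0] is out.logits  # None fields are dropped from the tuple
```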
output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in @@ -1563,11 +1537,10 @@ def forward( Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.'] ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.git( + outputs: BaseModelOutputWithPast = self.git( input_ids, attention_mask=attention_mask, position_ids=position_ids, @@ -1579,10 +1552,9 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.output(sequence_output) loss = None @@ -1598,10 +1570,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - return CausalLMOutputWithPast( loss=loss, logits=logits, From 0dc963b57a4a9bd6e65744307ed9c490c6c7b8f6 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 16:40:24 +0000 Subject: [PATCH 21/32] idefics --- src/transformers/models/idefics/vision.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/idefics/vision.py b/src/transformers/models/idefics/vision.py index 5e9f9b8ad772..c2f32a326a32 100644 --- a/src/transformers/models/idefics/vision.py +++ b/src/transformers/models/idefics/vision.py @@ -16,7 +16,7 @@ import math from dataclasses import dataclass -from typing import Optional, Tuple, Union +from typing import Optional, Tuple import torch import torch.utils.checkpoint @@ -24,7 +24,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling -from ...utils import ModelOutput, logging +from ...utils import ModelOutput, can_return_tuple, logging from .configuration_idefics import IdeficsVisionConfig @@ -348,6 +348,7 @@ def __init__(self, config: IdeficsVisionConfig): self.layers = nn.ModuleList([IdeficsVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -355,8 +356,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -390,7 +390,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -423,8 +422,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] 
if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -443,14 +440,14 @@ def __init__(self, config: IdeficsVisionConfig): self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) # Adapted from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward + @can_return_tuple def forward( self, pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: Optional[bool] = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -459,7 +456,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -471,16 +467,12 @@ def forward( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, From 34868af5c9c6811b5965deff49afa172e1b23892 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 16:46:40 +0000 Subject: [PATCH 22/32] kosmos2 --- .../models/kosmos2/modeling_kosmos2.py | 97 ++++++------------- 1 file changed, 27 insertions(+), 70 deletions(-) diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index 23a1391fa1f1..7ba3f575a778 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -16,7 +16,7 @@ import math from dataclasses import dataclass -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple import torch import torch.utils.checkpoint @@ -36,6 +36,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -654,6 +655,7 @@ def __init__(self, config: Kosmos2VisionConfig): self.layers = nn.ModuleList([Kosmos2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -661,8 +663,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -696,7 +697,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -729,8 +729,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - 
if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -749,19 +747,18 @@ def __init__(self, config: Kosmos2VisionConfig): self.encoder = Kosmos2VisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple def forward( self, pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -769,20 +766,16 @@ def forward( hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -1219,6 +1212,7 @@ def forward_embedding( return hidden_states + @can_return_tuple def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1235,14 +1229,12 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -1360,18 +1352,6 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - present_key_value_states, - all_hidden_states, - all_self_attns, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=present_key_value_states, @@ -1490,6 +1470,7 @@ def __init__(self, config: Kosmos2VisionConfig): def get_input_embeddings(self) -> nn.Module: return self.model.embeddings.patch_embedding + @can_return_tuple @add_start_docstrings_to_model_forward(KOSMOS2_VISION_INPUTS_DOCSTRING) 
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Kosmos2VisionConfig) def forward( @@ -1498,8 +1479,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1509,7 +1489,6 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) @@ -1528,6 +1507,7 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.model.embed_tokens = value + @can_return_tuple @add_start_docstrings_to_model_forward(KOSMOS2_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPastAndCrossAttentions, config_class=Kosmos2TextConfig) def forward( @@ -1546,8 +1526,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" Returns: @@ -1567,7 +1546,6 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) @@ -1603,6 +1581,7 @@ def get_output_embeddings(self) -> nn.Module: def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(KOSMOS2_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=Kosmos2TextConfig) def forward( @@ -1622,8 +1601,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). 
Indices should be in @@ -1633,14 +1611,13 @@ def forward( Returns: """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: if use_cache: logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") use_cache = False - outputs = self.model( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.model( input_ids=input_ids, attention_mask=attention_mask, image_embeds=image_embeds, @@ -1655,9 +1632,8 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - lm_logits = self.lm_head(outputs[0]) + lm_logits = self.lm_head(outputs.last_hidden_state) loss = None if labels is not None: @@ -1673,10 +1649,6 @@ def forward( shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length) ) - if not return_dict: - output = (lm_logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=loss, logits=lm_logits, @@ -1807,6 +1779,7 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.text_model.model.embed_tokens = value + @can_return_tuple @add_start_docstrings_to_model_forward(KOSMOS2_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Kosmos2ModelOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1824,8 +1797,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, Kosmos2ModelOutput]: + ) -> Kosmos2ModelOutput: r""" Returns: @@ -1863,7 +1835,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_model_output = None projection_attentions = None @@ -1871,20 +1842,19 @@ def forward( if pixel_values is None: raise ValueError("You have to specify either `pixel_values` or `image_embeds`.") - vision_model_output = self.vision_model( + vision_model_output: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`. 
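[Editorial aside, not part of the patch] The `# normalized features` step just below rescales each token's feature vector to unit L2 norm before the image-to-text projection. A shape-only illustration with stand-in tensors (sizes are placeholders, not Kosmos-2 activations):

```python
import torch
import torch.nn.functional as F

image_embeds = torch.randn(2, 64, 1024)           # stand-in for post-layernorm vision states
image_embeds = F.normalize(image_embeds, dim=-1)  # unit L2 norm per token
assert torch.allclose(image_embeds.norm(dim=-1), torch.ones(2, 64), atol=1e-5)
```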
- image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0]) + image_embeds = self.vision_model.model.post_layernorm(vision_model_output.last_hidden_state) # normalized features image_embeds = nn.functional.normalize(image_embeds, dim=-1) image_embeds, projection_attentions = self.image_to_text_projection(image_embeds) - outputs = self.text_model( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, image_embeds=image_embeds, @@ -1896,13 +1866,8 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - if not return_dict: - outputs = outputs + (image_embeds, projection_attentions, vision_model_output) - return tuple(output for output in outputs if output is not None) - return Kosmos2ModelOutput( last_hidden_state=outputs.last_hidden_state, past_key_values=outputs.past_key_values, @@ -1949,6 +1914,7 @@ def get_output_embeddings(self) -> nn.Module: def set_output_embeddings(self, new_embeddings): self.text_model.set_output_embeddings(new_embeddings) + @can_return_tuple @add_start_docstrings_to_model_forward(KOSMOS2_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Kosmos2ForConditionalGenerationModelOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1966,8 +1932,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, Kosmos2ForConditionalGenerationModelOutput]: + ) -> Kosmos2ForConditionalGenerationModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in @@ -2018,7 +1983,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_model_output = None projection_attentions = None @@ -2026,19 +1990,17 @@ def forward( if pixel_values is None: raise ValueError("You have to specify either `pixel_values` or `image_embeds`.") - vision_model_output = self.vision_model( + vision_model_output: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`. 
- image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0]) - # normalized features + image_embeds = self.vision_model.model.post_layernorm(vision_model_output.last_hidden_state) image_embeds = nn.functional.normalize(image_embeds, dim=-1) image_embeds, projection_attentions = self.image_to_text_projection(image_embeds) - lm_outputs = self.text_model( + lm_outputs: CausalLMOutputWithCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, image_embeds=image_embeds, @@ -2051,13 +2013,8 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - if not return_dict: - outputs = lm_outputs + (image_embeds, projection_attentions, vision_model_output) - return tuple(output for output in outputs if output is not None) - return Kosmos2ForConditionalGenerationModelOutput( loss=lm_outputs.loss, logits=lm_outputs.logits, From 45517f38c0e5b0c0df19f415b6302168dd4ff70a Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 16:53:25 +0000 Subject: [PATCH 23/32] x_clip --- .../models/x_clip/modeling_x_clip.py | 115 ++++++------------ 1 file changed, 39 insertions(+), 76 deletions(-) diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index f85b4636cdf4..de3d7102691b 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -16,7 +16,7 @@ from copy import copy from dataclasses import dataclass -from typing import Any, Optional, Tuple, Union +from typing import Any, Optional, Tuple import torch import torch.utils.checkpoint @@ -30,6 +30,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -685,6 +686,7 @@ def __init__(self, config: XCLIPConfig): self.layers = nn.ModuleList([XCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -692,8 +694,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -727,7 +728,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -760,8 +760,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -776,6 +774,7 @@ def __init__(self, config: XCLIPTextConfig): self.encoder = XCLIPEncoder(config) self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple @add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPTextConfig) def forward( @@ -785,8 
+784,7 @@ def forward( position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -795,7 +793,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is None: raise ValueError("You have to specify either input_ids") @@ -815,25 +812,21 @@ def forward( # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = self.final_layer_norm(last_hidden_state) # text_embeds.shape = [batch_size, sequence_length, transformer.width] # take features from the eot embedding (eot_token is the highest number in each sequence) pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -857,6 +850,7 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value + @can_return_tuple @add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPTextConfig) def forward( @@ -866,8 +860,7 @@ def forward( position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -891,7 +884,6 @@ def forward( position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) @@ -910,6 +902,7 @@ def __init__(self, config: XCLIPConfig): self.layers = nn.ModuleList([XCLIPVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -917,8 +910,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -952,7 +944,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -985,8 +976,6 @@ def forward( if 
output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -1007,6 +996,7 @@ def __init__(self, config: XCLIPVisionConfig): self.encoder = XCLIPVisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple @add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPVisionConfig) def forward( @@ -1015,8 +1005,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1025,25 +1014,20 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layernorm(hidden_states) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -1065,6 +1049,7 @@ def __init__(self, config: XCLIPVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @can_return_tuple @add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPVisionConfig) def forward( @@ -1072,8 +1057,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1154,7 +1138,6 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) @@ -1169,33 +1152,27 @@ def __init__(self, config: XCLIPVisionConfig): self.position_embedding = nn.Parameter(torch.empty(1, config.num_frames, config.hidden_size)) self.encoder = XCLIPEncoder(config) + @can_return_tuple def forward( self, - hidden_states, + hidden_states: torch.Tensor, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutputWithPooling: residual = hidden_states # add position embeddings hidden_states = hidden_states + self.position_embedding - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, 
output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] - + last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = last_hidden_state.type(hidden_states.dtype) + residual - pooled_output = last_hidden_state.mean(dim=1, keepdim=False) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -1343,6 +1320,7 @@ def __init__(self, config: XCLIPConfig): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING) def get_text_features( self, @@ -1351,7 +1329,6 @@ def get_text_features( position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1374,18 +1351,16 @@ def get_text_features( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - text_embeds = text_outputs[1] + text_embeds = text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) return text_embeds @@ -1396,7 +1371,6 @@ def get_video_features( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1478,33 +1452,31 @@ def get_video_features( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict batch_size, num_frames, num_channels, height, width = pixel_values.shape pixel_values = pixel_values.reshape(-1, num_channels, height, width) - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - video_embeds = vision_outputs[1] + video_embeds = vision_outputs.pooler_output video_embeds = self.visual_projection(video_embeds) cls_features = video_embeds.view(batch_size, num_frames, -1) - mit_outputs = self.mit( + mit_outputs: BaseModelOutputWithPooling = self.mit( cls_features, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - video_embeds = mit_outputs[1] + video_embeds = mit_outputs.pooler_output return video_embeds + @can_return_tuple @add_start_docstrings_to_model_forward(X_CLIP_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=XCLIPOutput, config_class=XCLIPConfig) def forward( @@ -1517,8 +1489,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, XCLIPOutput]: + ) -> XCLIPOutput: r""" Returns: 
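[Editorial aside, not part of the patch] Both `get_video_features` above and `forward` in the next hunk flatten the clip dimension before the frozen image tower and then regroup the per-frame pooled features before feeding them to `self.mit`. A shape-only sketch with dummy values (sizes are placeholders, not X-CLIP defaults):

```python
import torch

batch_size, num_frames, hidden_size = 2, 8, 512    # placeholder sizes
# frames are flattened to (batch * frames, C, H, W) before the vision tower ...
frame_embeds = torch.randn(batch_size * num_frames, hidden_size)  # pooled output per frame
# ... and regrouped per clip before being passed to self.mit
cls_features = frame_embeds.view(batch_size, num_frames, -1)
assert cls_features.shape == (batch_size, num_frames, hidden_size)
```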
@@ -1608,48 +1579,44 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict batch_size, num_frames, num_channels, height, width = pixel_values.shape pixel_values = pixel_values.reshape(-1, num_channels, height, width) - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - video_embeds = vision_outputs[1] + video_embeds = vision_outputs.pooler_output video_embeds = self.visual_projection(video_embeds) cls_features = video_embeds.view(batch_size, num_frames, -1) - mit_outputs = self.mit( + mit_outputs: BaseModelOutputWithPooling = self.mit( cls_features, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - video_embeds = mit_outputs[1] + video_embeds = mit_outputs.pooler_output - img_features = vision_outputs[0][:, 1:, :] + img_features = vision_outputs.last_hidden_state[:, 1:, :] img_features = self.prompts_visual_layernorm(img_features) img_features = img_features @ self.prompts_visual_projection img_features = img_features.view(batch_size, num_frames, -1, video_embeds.shape[-1]) img_features = img_features.mean(dim=1, keepdim=False) - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - text_embeds = text_outputs[1] + text_embeds = text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) text_embeds = text_embeds.unsqueeze(0).expand(batch_size, -1, -1) @@ -1668,10 +1635,6 @@ def forward( if return_loss: loss = x_clip_loss(logits_per_text) - if not return_dict: - output = (logits_per_video, logits_per_text, text_embeds, video_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - return XCLIPOutput( loss=loss, logits_per_video=logits_per_video, From 33238d177ad2ef276c4dfb7439ed1e845420c4cc Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 16:58:02 +0000 Subject: [PATCH 24/32] roberta_prelayernorm --- .../modeling_roberta_prelayernorm.py | 122 +++++------------- 1 file changed, 32 insertions(+), 90 deletions(-) diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py index 6b0c40b222c1..56d42ac1bdcd 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py @@ -16,7 +16,7 @@ """PyTorch RoBERTa-PreLayerNorm model.""" import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -41,6 +41,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -468,6 +469,7 @@ def __init__(self, config): self.layer = nn.ModuleList([RobertaPreLayerNormLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + 
@can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -479,8 +481,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -533,18 +534,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -715,6 +704,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -735,8 +725,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -761,7 +750,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -824,7 +812,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -834,15 +822,11 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state sequence_output = self.LayerNorm(sequence_output) pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -881,6 +865,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -898,9 +883,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -944,11 +928,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.roberta_prelayernorm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta_prelayernorm( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -961,10 +944,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) lm_loss = None @@ -978,10 +960,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1028,6 +1006,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1051,8 +1030,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1061,9 +1039,7 @@ def forward( kwargs (`Dict[str, any]`, *optional*, defaults to `{}`): Used to hide legacy arguments that have been deprecated. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta_prelayernorm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta_prelayernorm( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1074,9 +1050,8 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) masked_lm_loss = None @@ -1086,10 +1061,6 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1149,6 +1120,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1167,17 +1139,14 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta_prelayernorm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta_prelayernorm( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1186,9 +1155,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.classifier(sequence_output) loss = None @@ -1216,10 +1184,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, @@ -1247,6 +1211,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @@ -1266,15 +1231,13 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1287,7 +1250,7 @@ def forward( else None ) - outputs = self.roberta_prelayernorm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta_prelayernorm( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, @@ -1296,9 +1259,8 @@ def forward( inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1311,10 +1273,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1345,6 +1303,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1363,15 +1322,12 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta_prelayernorm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta_prelayernorm( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1380,11 +1336,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1395,10 +1349,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1448,6 +1398,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1467,8 +1418,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1479,9 +1429,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta_prelayernorm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta_prelayernorm( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1490,11 +1438,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1517,10 +1463,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, From dfa269b7321dc3dec29fb7e26059dc5f6b679724 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 17:03:27 +0000 Subject: [PATCH 25/32] roc_bert --- .../models/roc_bert/modeling_roc_bert.py | 142 +++++------------- 1 file changed, 41 insertions(+), 101 deletions(-) diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index b5ca264fb73d..d2d6c5aaf29b 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -16,7 +16,7 @@ import math import os -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -41,6 +41,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -613,6 +614,7 @@ def __init__(self, config): self.layer = nn.ModuleList([RoCBertLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -624,8 +626,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -678,18 +679,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -932,6 +921,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -955,8 +945,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: 
Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -980,7 +969,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1045,7 +1033,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1055,14 +1043,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1100,6 +1084,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1124,9 +1109,8 @@ def forward( labels_token_type_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" attack_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): attack sample ids for computing the contrastive loss. 
Indices should be in `[-100, 0, ..., @@ -1185,9 +1169,8 @@ def forward( torch.Size([1, 11, 21128]) ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1198,10 +1181,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output, pooled_output = outputs[:2] + sequence_output = outputs.last_hidden_state + pooled_output = outputs.pooler_output prediction_scores = self.cls(sequence_output) loss = None @@ -1216,25 +1199,23 @@ def forward( target_inputs = torch.clone(labels_input_ids) target_inputs[target_inputs == -100] = self.config.pad_token_id - labels_output = self.roc_bert( + labels_output: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( target_inputs, input_shape_ids=labels_input_shape_ids, input_pronunciation_ids=labels_input_pronunciation_ids, attention_mask=labels_attention_mask, token_type_ids=labels_token_type_ids, - return_dict=return_dict, ) - attack_output = self.roc_bert( + attack_output: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( attack_input_ids, input_shape_ids=attack_input_shape_ids, input_pronunciation_ids=attack_input_pronunciation_ids, attention_mask=attack_attention_mask, token_type_ids=attack_token_type_ids, - return_dict=return_dict, ) - labels_pooled_output = labels_output[1] - attack_pooled_output = attack_output[1] + labels_pooled_output = labels_output.pooler_output + attack_pooled_output = attack_output.pooler_output pooled_output_norm = torch.nn.functional.normalize(pooled_output, dim=-1) labels_pooled_output_norm = torch.nn.functional.normalize(labels_pooled_output, dim=-1) @@ -1252,10 +1233,6 @@ def forward( else: loss = masked_lm_loss - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MaskedLMOutput( loss=loss, logits=prediction_scores, @@ -1293,6 +1270,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, @@ -1309,8 +1287,7 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1338,9 +1315,8 @@ def forward( '.' 
``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1353,10 +1329,9 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) masked_lm_loss = None @@ -1364,10 +1339,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1431,6 +1402,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -1450,9 +1422,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1502,9 +1473,8 @@ def forward( >>> prediction_logits = outputs.logits ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1519,10 +1489,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) lm_loss = None @@ -1534,10 +1503,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1621,6 +1586,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, @@ -1642,17 +1608,15 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1663,10 +1627,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1693,9 +1656,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, @@ -1725,6 +1685,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( ROC_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @@ -1746,15 +1707,13 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1773,7 +1732,7 @@ def forward( else None ) - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1784,10 +1743,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1798,10 +1756,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1831,6 +1785,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, @@ -1852,15 +1807,12 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1871,10 +1823,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1884,10 +1835,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1913,6 +1860,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_QA, @@ -1937,8 +1885,7 @@ def forward( end_positions: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1949,9 +1896,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1962,10 +1907,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -1989,10 +1933,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, From c5092d6d4599cc5f719673b13b3e45522a5312c2 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 17:05:47 +0000 Subject: [PATCH 26/32] xlm_roberta --- .../xlm_roberta/modeling_xlm_roberta.py | 122 +++++------------- 1 file changed, 32 insertions(+), 90 deletions(-) diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index 1fe5823c2066..2c5523fe61d4 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -16,7 +16,7 @@ """PyTorch XLM-RoBERTa model.""" import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -46,6 +46,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, get_torch_version, logging, replace_return_docstrings, @@ -585,6 +586,7 @@ def __init__(self, config): self.layer = nn.ModuleList([XLMRobertaLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -596,8 +598,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -650,18 +651,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -835,6 +824,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -855,8 +845,7 @@ def forward( use_cache: Optional[bool] = None, 
output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -881,7 +870,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -976,7 +964,7 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -986,14 +974,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1030,6 +1014,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -1047,9 +1032,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1093,11 +1077,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1110,10 +1093,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) lm_loss = None @@ -1127,10 +1109,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1178,6 +1156,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1200,8 +1179,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1210,9 +1188,7 @@ def forward( kwargs (`Dict[str, any]`, *optional*, defaults to `{}`): Used to hide legacy arguments that have been deprecated. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1223,9 +1199,8 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) masked_lm_loss = None @@ -1235,10 +1210,6 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1299,6 +1270,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="cardiffnlp/twitter-roberta-base-emotion", @@ -1318,17 +1290,14 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1337,9 +1306,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.classifier(sequence_output) loss = None @@ -1367,10 +1335,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, @@ -1398,6 +1362,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @@ -1417,15 +1382,13 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1438,7 +1401,7 @@ def forward( else None ) - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, @@ -1447,9 +1410,8 @@ def forward( inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1462,10 +1424,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1497,6 +1455,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="Jean-Baptiste/roberta-large-ner-english", @@ -1516,15 +1475,12 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1533,11 +1489,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1548,10 +1502,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1602,6 +1552,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="deepset/roberta-base-squad2", @@ -1622,8 +1573,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1634,9 +1584,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1645,11 +1593,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1672,10 +1618,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, From 7716f7855dfcb8f87ce6fd654e9a974e9f619231 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 17:11:09 +0000 Subject: [PATCH 27/32] xlm_roberta_xl --- .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 120 +++++------------- 1 file changed, 32 insertions(+), 88 deletions(-) diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index ad43c7903f4f..85577a767693 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -15,7 +15,7 @@ """PyTorch XLM RoBERTa xl,xxl model.""" import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -45,6 +45,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, get_torch_version, logging, replace_return_docstrings, @@ -578,6 +579,7 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states, @@ -589,8 +591,7 @@ def forward( use_cache=None, output_attentions=False, output_hidden_states=False, - return_dict=True, - ): + ) -> BaseModelOutputWithPastAndCrossAttentions: if self.gradient_checkpointing and self.training: if use_cache: logger.warning_once( @@ -644,18 +645,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -816,6 +805,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -836,8 +826,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> 
BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -862,7 +851,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -957,7 +945,7 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -967,14 +955,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1010,6 +994,7 @@ def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings self.lm_head.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -1027,9 +1012,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1070,11 +1054,10 @@ def forward( >>> prediction_logits = outputs.logits ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.roberta( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1087,10 +1070,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) lm_loss = None @@ -1102,10 +1084,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1188,6 +1166,7 @@ def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings self.lm_head.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1208,8 +1187,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1218,9 +1196,7 @@ def forward( kwargs (`Dict[str, any]`, *optional*, defaults to `{}`): Used to hide legacy arguments that have been deprecated. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1231,9 +1207,8 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) masked_lm_loss = None @@ -1241,10 +1216,6 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1302,6 +1273,7 @@ def __init__(self, config): self.init_weights() + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1319,17 +1291,15 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1338,9 +1308,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.classifier(sequence_output) loss = None @@ -1366,10 +1335,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, @@ -1395,6 +1360,7 @@ def __init__(self, config): self.init_weights() + @can_return_tuple @add_start_docstrings_to_model_forward( XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @@ -1414,15 +1380,13 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1435,7 +1399,7 @@ def forward( else None ) - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, @@ -1444,9 +1408,8 @@ def forward( inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1457,10 +1420,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1490,6 +1449,7 @@ def __init__(self, config): self.init_weights() + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1507,15 +1467,12 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1524,11 +1481,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1546,10 +1501,6 @@ def forward( else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1597,6 +1548,7 @@ def __init__(self, config): self.init_weights() + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1615,8 +1567,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. 
@@ -1627,9 +1578,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1638,10 +1587,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -1665,10 +1613,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, From 6225eaacf93b6e68300816ec5ec320518c8bf546 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 17:14:07 +0000 Subject: [PATCH 28/32] splinter --- .../models/splinter/modeling_splinter.py | 67 ++++++------------- 1 file changed, 22 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 174e766598a0..2fcd33d14386 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -16,7 +16,7 @@ import math from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -27,7 +27,13 @@ from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, ModelOutput, QuestionAnsweringModelOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + can_return_tuple, + logging, +) from .configuration_splinter import SplinterConfig @@ -424,6 +430,7 @@ def __init__(self, config): self.layer = nn.ModuleList([SplinterLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -435,8 +442,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -489,18 +495,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - 
all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -633,6 +627,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(SPLINTER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -653,8 +648,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -678,7 +672,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -735,7 +728,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -745,12 +738,8 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] - - if not return_dict: - return (sequence_output,) + encoder_outputs[1:] + sequence_output = encoder_outputs.last_hidden_state return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=sequence_output, @@ -835,6 +824,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(SPLINTER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -853,9 +843,8 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, question_positions: Optional[torch.LongTensor] = None, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -871,7 +860,6 @@ def forward( the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size, sequence_length)`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict question_positions_were_none = False if question_positions is None: @@ -886,7 +874,7 @@ def forward( question_positions = question_position_for_each_example.unsqueeze(-1) question_positions_were_none = True - outputs = self.splinter( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.splinter( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -895,10 +883,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state start_logits, end_logits = self.splinter_qass(sequence_output, question_positions) if question_positions_were_none: @@ -925,10 +912,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -989,6 +972,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( SPLINTER_INPUTS_DOCSTRING.format("batch_size, num_questions, sequence_length") ) @@ -1004,9 +988,8 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, question_positions: Optional[torch.LongTensor] = None, - ) -> Union[Tuple, SplinterForPreTrainingOutput]: + ) -> SplinterForPreTrainingOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1022,7 +1005,6 @@ def forward( the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size, sequence_length)`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if question_positions is None and start_positions is not None and end_positions is not None: raise TypeError("question_positions must be specified in order to calculate the loss") @@ -1033,7 +1015,7 @@ def forward( elif question_positions is None: question_positions = self._prepare_question_positions(input_ids) - outputs = self.splinter( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.splinter( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1042,10 +1024,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state batch_size, sequence_length, dim = sequence_output.size() # [batch_size, num_questions, sequence_length] start_logits, end_logits = self.splinter_qass(sequence_output, question_positions) @@ -1080,10 +1061,6 @@ def forward( ) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output - return SplinterForPreTrainingOutput( loss=total_loss, start_logits=start_logits, From 506b299d37baee9159b9bceb85d8a6561071a1e9 Mon Sep 17 00:00:00 2001 From: qubvel Date: Wed, 9 Apr 2025 17:17:06 +0000 Subject: [PATCH 29/32] fix-copies for mobilebert --- .../models/mobilebert/modeling_mobilebert.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index ff0f5f43797c..7a8fbd06d72b 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -1215,7 +1215,7 @@ def forward( """, MOBILEBERT_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification with Bert->MobileBert all-casing +# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification with Bert->MobileBert all-casing, BaseModelOutputWithPoolingAndCrossAttentions->BaseModelOutputWithPooling class MobileBertForSequenceClassification(MobileBertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1259,7 +1259,7 @@ def forward( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1312,7 +1312,7 @@ def forward( """, MOBILEBERT_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MobileBert all-casing +# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MobileBert all-casing, BaseModelOutputWithPoolingAndCrossAttentions->BaseModelOutputWithPooling class MobileBertForQuestionAnswering(MobileBertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1358,7 +1358,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1408,7 +1408,7 @@ def forward( """, MOBILEBERT_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice with Bert->MobileBert all-casing +# Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice with Bert->MobileBert all-casing, BaseModelOutputWithPoolingAndCrossAttentions->BaseModelOutputWithPooling class MobileBertForMultipleChoice(MobileBertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1462,7 +1462,7 @@ def forward( else None ) - outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1498,7 +1498,7 @@ def forward( """, MOBILEBERT_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MobileBert all-casing +# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MobileBert all-casing, BaseModelOutputWithPoolingAndCrossAttentions->BaseModelOutputWithPooling class MobileBertForTokenClassification(MobileBertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1539,7 +1539,7 @@ def forward( labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, From b3876a6f7343b1a7b635e0f4848570281ca94025 Mon Sep 17 00:00:00 2001 From: qubvel Date: Fri, 18 Apr 2025 16:23:58 +0000 Subject: [PATCH 30/32] Fixup --- src/transformers/models/altclip/modeling_altclip.py | 3 +-- src/transformers/models/idefics/vision.py | 9 ++------- src/transformers/models/kosmos2/modeling_kosmos2.py | 3 +-- src/transformers/models/x_clip/modeling_x_clip.py | 3 +-- 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 8d9be13465e8..28b2fa7fe288 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -16,8 +16,7 @@ import math from dataclasses import dataclass -from typing import Any, List, Optional, Tuple -from typing import Any, Callable, List, Optional, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple import torch import torch.nn as nn diff --git a/src/transformers/models/idefics/vision.py b/src/transformers/models/idefics/vision.py index a3523f77b817..f23445665251 100644 --- a/src/transformers/models/idefics/vision.py +++ b/src/transformers/models/idefics/vision.py @@ -16,8 +16,7 @@ import math from dataclasses import dataclass -from typing import Optional, Tuple -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Optional, Tuple import torch import torch.utils.checkpoint @@ -25,12 +24,8 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling -from ...utils import ModelOutput, can_return_tuple, logging from ...modeling_utils import 
ALL_ATTENTION_FUNCTIONS -from ...utils import ( - ModelOutput, - logging, -) +from ...utils import ModelOutput, can_return_tuple, logging from .configuration_idefics import IdeficsVisionConfig diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index e9946bd0ba60..7cb6657b2cb3 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -16,8 +16,7 @@ import math from dataclasses import dataclass -from typing import Any, List, Optional, Tuple -from typing import Any, Callable, List, Optional, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple import torch import torch.utils.checkpoint diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 838b7e07bb2c..07cc76e7acfa 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -16,8 +16,7 @@ import copy from dataclasses import dataclass -from typing import Any, Optional, Tuple -from typing import Any, Callable, Optional, Tuple, Union +from typing import Any, Callable, Optional, Tuple import torch import torch.utils.checkpoint From a0618668f8a053812b9e0950e89ce5b866692414 Mon Sep 17 00:00:00 2001 From: qubvel Date: Fri, 18 Apr 2025 16:30:02 +0000 Subject: [PATCH 31/32] Remove fx-compatibility for bert, electra, roberta, mobilebert --- tests/models/bert/test_modeling_bert.py | 1 - tests/models/electra/test_modeling_electra.py | 1 - tests/models/mobilebert/test_modeling_mobilebert.py | 1 - tests/models/roberta/test_modeling_roberta.py | 1 - 4 files changed, 4 deletions(-) diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index a6aac8e3829a..92893fbddd0e 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -463,7 +463,6 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin if is_torch_available() else {} ) - fx_compatible = True model_split_percents = [0.5, 0.8, 0.9] # special case for ForPreTraining model diff --git a/tests/models/electra/test_modeling_electra.py b/tests/models/electra/test_modeling_electra.py index 7d451ff6378a..9029501934f1 100644 --- a/tests/models/electra/test_modeling_electra.py +++ b/tests/models/electra/test_modeling_electra.py @@ -403,7 +403,6 @@ class ElectraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) if is_torch_available() else {} ) - fx_compatible = True # special case for ForPreTraining model def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/mobilebert/test_modeling_mobilebert.py b/tests/models/mobilebert/test_modeling_mobilebert.py index 126631fd9ce4..a2dd0a295041 100644 --- a/tests/models/mobilebert/test_modeling_mobilebert.py +++ b/tests/models/mobilebert/test_modeling_mobilebert.py @@ -282,7 +282,6 @@ class MobileBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa if is_torch_available() else {} ) - fx_compatible = True # special case for ForPreTraining model def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index 4f4d93b07f4d..f3adeafb17ba 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -392,7 +392,6 @@ class 
RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi if is_torch_available() else {} ) - fx_compatible = True model_split_percents = [0.5, 0.8, 0.9] def setUp(self): From a1ab1e07395ce236d12eab5e4a7ea98636b70d1a Mon Sep 17 00:00:00 2001 From: qubvel Date: Fri, 18 Apr 2025 16:43:40 +0000 Subject: [PATCH 32/32] trigger
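
The patches in this series apply one mechanical refactor per model: the manual `return_dict` plumbing is removed from each `forward`, sub-module calls are annotated with the typed ModelOutput they return, and the `@can_return_tuple` decorator restores the legacy tuple return for callers that still pass `return_dict=False`. What follows is a minimal, self-contained sketch of that pattern, not the library's actual implementation; `SimpleOutput`, `ToyEncoder`, and the decorator body shown here are illustrative assumptions, while the real `can_return_tuple` lives in `transformers.utils` and may also consult `config.use_return_dict`.

# Illustrative sketch only -- approximates the pattern adopted by the patches above.
# `SimpleOutput`, `ToyEncoder`, and this decorator body are hypothetical stand-ins,
# not the transformers implementation.

from dataclasses import dataclass, fields
from functools import wraps
from typing import Optional, Tuple

import torch
from torch import nn


@dataclass
class SimpleOutput:
    """Minimal stand-in for a ModelOutput-style container."""

    last_hidden_state: torch.Tensor
    hidden_states: Optional[Tuple[torch.Tensor, ...]] = None

    def to_tuple(self) -> tuple:
        # Keep only the fields that were actually populated, mirroring ModelOutput.to_tuple().
        return tuple(getattr(self, f.name) for f in fields(self) if getattr(self, f.name) is not None)


def can_return_tuple(forward):
    """Sketch of the decorator: pop `return_dict` and convert the typed output if it is False."""

    @wraps(forward)
    def wrapper(self, *args, **kwargs):
        return_dict = kwargs.pop("return_dict", None)
        output = forward(self, *args, **kwargs)
        if return_dict is False:
            return output.to_tuple()
        return output

    return wrapper


class ToyEncoder(nn.Module):
    def __init__(self, hidden_size: int = 8):
        super().__init__()
        self.layer = nn.Linear(hidden_size, hidden_size)

    @can_return_tuple
    def forward(self, hidden_states: torch.Tensor, output_hidden_states: bool = False) -> SimpleOutput:
        # The body always builds the typed output; no manual `if not return_dict:` branch remains.
        all_hidden_states = (hidden_states,) if output_hidden_states else None
        hidden_states = self.layer(hidden_states)
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)
        return SimpleOutput(last_hidden_state=hidden_states, hidden_states=all_hidden_states)


if __name__ == "__main__":
    encoder = ToyEncoder()
    x = torch.randn(1, 4, 8)
    print(type(encoder(x)).__name__)                     # SimpleOutput
    print(type(encoder(x, return_dict=False)).__name__)  # tuple

With this shape, each forward body builds exactly one typed output, and tuple compatibility is handled in a single place instead of being re-implemented at the end of every model's forward.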