diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 68abb317ee3f..b199a087326a 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -46,6 +46,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -508,6 +509,7 @@ def __init__(self, config: AlbertConfig): self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size) self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)]) + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -515,8 +517,7 @@ def forward( head_mask: Optional[torch.FloatTensor] = None, output_attentions: bool = False, output_hidden_states: bool = False, - return_dict: bool = True, - ) -> Union[BaseModelOutput, Tuple]: + ) -> BaseModelOutput: hidden_states = self.embedding_hidden_mapping_in(hidden_states) all_hidden_states = (hidden_states,) if output_hidden_states else None @@ -546,8 +547,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) @@ -733,6 +732,7 @@ def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: inner_group_idx = int(layer - group_idx * self.config.inner_group_num) self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -749,13 +749,11 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[BaseModelOutputWithPooling, Tuple]: + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -802,22 +800,18 @@ def forward( head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( embedding_output, extended_attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -855,6 +849,7 @@ def set_output_embeddings(self, new_embeddings: nn.Linear) -> None: def 
get_input_embeddings(self) -> nn.Embedding: return self.albert.embeddings.word_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=AlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -869,8 +864,7 @@ def forward( sentence_order_label: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[AlbertForPreTrainingOutput, Tuple]: + ) -> AlbertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -899,9 +893,8 @@ def forward( >>> prediction_logits = outputs.prediction_logits >>> sop_logits = outputs.sop_logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.albert( + outputs: BaseModelOutputWithPooling = self.albert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -910,10 +903,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output, pooled_output = outputs[:2] + sequence_output = outputs.last_hidden_state + pooled_output = outputs.pooler_output prediction_scores = self.predictions(sequence_output) sop_scores = self.sop_classifier(pooled_output) @@ -925,10 +918,6 @@ def forward( sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1)) total_loss = masked_lm_loss + sentence_order_loss - if not return_dict: - output = (prediction_scores, sop_scores) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return AlbertForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, @@ -1007,6 +996,7 @@ def set_output_embeddings(self, new_embeddings: nn.Linear) -> None: def get_input_embeddings(self) -> nn.Embedding: return self.albert.embeddings.word_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1020,8 +1010,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[MaskedLMOutput, Tuple]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., @@ -1059,9 +1048,8 @@ def forward( 0.81 ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.albert( + outputs: BaseModelOutputWithPooling = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1070,10 +1058,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_outputs = outputs[0] - + sequence_outputs = outputs.last_hidden_state prediction_scores = self.predictions(sequence_outputs) masked_lm_loss = None @@ -1081,10 +1067,6 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1113,6 +1095,7 @@ def __init__(self, config: AlbertConfig): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="textattack/albert-base-v2-imdb", @@ -1132,17 +1115,15 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[SequenceClassifierOutput, Tuple]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.albert( + outputs: BaseModelOutputWithPooling = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1151,10 +1132,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1182,10 +1162,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, @@ -1218,6 +1194,7 @@ def __init__(self, config: AlbertConfig): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1235,15 +1212,13 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[TokenClassifierOutput, Tuple]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.albert( + outputs: BaseModelOutputWithPooling = self.albert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1252,10 +1227,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1265,10 +1239,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1295,6 +1265,7 @@ def __init__(self, config: AlbertConfig): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="twmkn9/albert-base-v2-squad2", @@ -1317,8 +1288,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[AlbertForPreTrainingOutput, Tuple]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1329,9 +1299,8 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.albert( + outputs: BaseModelOutputWithPooling = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1340,10 +1309,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits: torch.Tensor = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -1367,10 +1335,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -1398,6 +1362,7 @@ def __init__(self, config: AlbertConfig): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1415,15 +1380,13 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[AlbertForPreTrainingOutput, Tuple]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. 
(see *input_ids* above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1435,7 +1398,7 @@ def forward( if inputs_embeds is not None else None ) - outputs = self.albert( + outputs: BaseModelOutputWithPooling = self.albert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1444,10 +1407,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits: torch.Tensor = self.classifier(pooled_output) @@ -1458,10 +1420,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index a007b7a7c6d6..a45d65b5ba2e 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -35,6 +35,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -643,11 +644,11 @@ def round_repeats(repeats): self.blocks = nn.ModuleList(blocks) + @can_return_tuple def forward( self, hidden_states: torch.FloatTensor, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, ) -> BaseModelOutputWithPoolingAndNoAttention: all_hidden_states = (hidden_states,) if output_hidden_states else None @@ -656,9 +657,6 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) - return BaseModelOutputWithNoAttention( last_hidden_state=hidden_states, hidden_states=all_hidden_states, @@ -1063,6 +1061,7 @@ def __init__(self, config): self.layer = nn.ModuleList([AlignTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -1074,8 +1073,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -1128,18 +1126,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -1220,6 +1206,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): 
self.embeddings.word_embeddings = value + @can_return_tuple @add_start_docstrings_to_model_forward(ALIGN_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=AlignTextConfig) def forward( @@ -1232,8 +1219,7 @@ def forward( inputs_embeds: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" Returns: @@ -1255,7 +1241,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -1298,20 +1283,16 @@ def forward( token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1350,14 +1331,14 @@ def __init__(self, config: AlignVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.convolution + @can_return_tuple @add_start_docstrings_to_model_forward(ALIGN_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndNoAttention, config_class=AlignVisionConfig) def forward( self, pixel_values: Optional[torch.FloatTensor] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]: + ) -> BaseModelOutputWithPoolingAndNoAttention: r""" Returns: @@ -1383,26 +1364,21 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") embedding_output = self.embeddings(pixel_values) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndNoAttention = self.encoder( embedding_output, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) # Apply pooling - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = self.pooler(last_hidden_state) # Reshape (batch_size, projection_dim, 1 , 1) -> (batch_size, projection_dim) pooled_output = pooled_output.reshape(pooled_output.shape[:2]) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -1453,9 +1429,6 @@ def get_text_features( position_ids: 
Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1473,37 +1446,22 @@ def get_text_features( >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) ```""" - # Use ALIGN model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, ) - - last_hidden_state = text_outputs[0][:, 0, :] + last_hidden_state = text_outputs.last_hidden_state[:, 0, :] text_features = self.text_projection(last_hidden_state) - return text_features @add_start_docstrings_to_model_forward(ALIGN_VISION_INPUTS_DOCSTRING) - def get_image_features( - self, - pixel_values: Optional[torch.FloatTensor] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: + def get_image_features(self, pixel_values: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: r""" Returns: image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by @@ -1526,22 +1484,15 @@ def get_image_features( >>> image_features = model.get_image_features(**inputs) ```""" - # Use ALIGN model's config for some fields (if specified) instead of those of vision & text components. 
- output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPoolingAndNoAttention = self.vision_model( pixel_values=pixel_values, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_hidden_states=False, ) - - image_features = vision_outputs[1] # pooled_output - + image_features = vision_outputs.pooler_output return image_features + @can_return_tuple @add_start_docstrings_to_model_forward(ALIGN_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=AlignOutput, config_class=AlignConfig) def forward( @@ -1556,8 +1507,7 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, AlignOutput]: + ) -> AlignOutput: r""" Returns: @@ -1587,15 +1537,13 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPoolingAndNoAttention = self.vision_model( pixel_values=pixel_values, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1604,11 +1552,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - image_embeds = vision_outputs[1] - text_embeds = text_outputs[0][:, 0, :] + image_embeds = vision_outputs.pooler_output + text_embeds = text_outputs.last_hidden_state[:, 0, :] text_embeds = self.text_projection(text_embeds) # normalized features @@ -1623,10 +1570,6 @@ def forward( if return_loss: loss = align_loss(logits_per_text) - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - return AlignOutput( loss=loss, logits_per_image=logits_per_image, diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 9c8be7ea8079..28b2fa7fe288 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -16,7 +16,7 @@ import math from dataclasses import dataclass -from typing import Any, Callable, List, Optional, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple import torch import torch.nn as nn @@ -35,6 +35,7 @@ from ...utils import ( ModelOutput, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -625,6 +626,7 @@ def __init__(self, config): self.layer = nn.ModuleList([AltRobertaLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -636,8 +638,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) 
-> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -690,18 +691,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -910,6 +899,7 @@ def __init__(self, config: AltCLIPConfig): self.layers = nn.ModuleList([AltCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -917,8 +907,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -952,7 +941,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -985,8 +973,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -1144,6 +1130,7 @@ def __init__(self, config: AltCLIPVisionConfig): self.encoder = AltCLIPEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple @add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AltCLIPVisionConfig) def forward( @@ -1151,9 +1138,8 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, interpolate_pos_encoding: Optional[bool] = False, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1162,7 +1148,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1170,20 +1155,16 @@ def forward( hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - 
last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -1205,6 +1186,7 @@ def __init__(self, config: AltCLIPVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @can_return_tuple @add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AltCLIPVisionConfig) def forward( @@ -1213,8 +1195,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1237,14 +1218,11 @@ def forward( >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - return self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) @@ -1308,8 +1286,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1334,7 +1311,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1397,7 +1373,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1407,14 +1383,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1444,6 +1416,7 @@ def set_input_embeddings(self, value: nn.Embedding) -> None: def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding: return super().resize_token_embeddings(new_num_tokens) + @can_return_tuple @add_start_docstrings_to_model_forward(ALTCLIP_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndProjection, config_class=AltCLIPTextConfig) def forward( @@ -1457,9 +1430,8 @@ def forward( encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPoolingAndProjection]: + ) -> BaseModelOutputWithPoolingAndProjection: r""" Returns: @@ -1479,10 +1451,7 @@ def forward( >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1493,11 +1462,10 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) # last module outputs - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state # project every module sequence_output = self.pre_LN(sequence_output) @@ -1506,9 +1474,6 @@ def forward( projection_state = self.transformation(sequence_output) pooler_output = projection_state[:, 0] - if not return_dict: - return (projection_state, pooler_output) + outputs[2:4] - return BaseModelOutputWithPoolingAndProjection( last_hidden_state=projection_state, pooler_output=pooler_output, @@ -1558,9 +1523,6 @@ def get_text_features( attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, token_type_ids=None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1577,23 +1539,15 @@ def get_text_features( 
>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) ```""" - # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, token_type_ids=token_type_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, ) - pooled_output = text_outputs[1] + pooled_output = text_outputs.pooler_output text_features = self.text_projection(pooled_output) return text_features @@ -1602,10 +1556,7 @@ def get_text_features( def get_image_features( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1626,26 +1577,17 @@ def get_image_features( >>> inputs = processor(images=image, return_tensors="pt") >>> image_features = model.get_image_features(**inputs) ```""" - # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - - pooled_output = vision_outputs[1] # pooled_output + pooled_output = vision_outputs.pooler_output image_features = self.visual_projection(pooled_output) - return image_features + @can_return_tuple @add_start_docstrings_to_model_forward(ALTCLIP_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=AltCLIPOutput, config_class=AltCLIPConfig) def forward( @@ -1659,8 +1601,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, AltCLIPOutput]: + ) -> AltCLIPOutput: r""" Returns: @@ -1687,30 +1628,27 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, 
output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - image_embeds = vision_outputs[1] + image_embeds = vision_outputs.pooler_output image_embeds = self.visual_projection(image_embeds) - text_embeds = text_outputs[1] + text_embeds = text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) # normalized features @@ -1726,10 +1664,6 @@ def forward( if return_loss: loss = clip_loss(logits_per_text) - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - return AltCLIPOutput( loss=loss, logits_per_image=logits_per_image, diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 3a311f4a734c..229d591647f7 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -19,7 +19,7 @@ import os import warnings from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -51,6 +51,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, get_torch_version, logging, replace_return_docstrings, @@ -648,6 +649,7 @@ def __init__(self, config): self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -659,8 +661,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -713,18 +714,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -1000,6 +989,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1020,8 +1010,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states 
(`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1046,7 +1035,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1141,7 +1129,7 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1151,14 +1139,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1195,6 +1179,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1209,8 +1194,7 @@ def forward( next_sentence_label: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BertForPreTrainingOutput]: + ) -> BertForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., @@ -1243,9 +1227,8 @@ def forward( >>> seq_relationship_logits = outputs.seq_relationship_logits ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1254,10 +1237,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output, pooled_output = outputs[:2] + sequence_output = outputs.last_hidden_state + pooled_output = outputs.pooler_output prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) total_loss = None @@ -1267,10 +1250,6 @@ def forward( next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return BertForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, @@ -1305,6 +1284,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1326,9 +1306,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **loss_kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1353,11 +1332,10 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.bert( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1370,20 +1348,15 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) lm_loss = None if labels is not None: lm_loss = self.loss_function(prediction_scores, labels, self.config.vocab_size, **loss_kwargs) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1428,6 +1401,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1449,8 +1423,7 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1458,9 +1431,7 @@ def forward( loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1471,10 +1442,9 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) masked_lm_loss = None @@ -1482,10 +1452,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1532,6 +1498,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1545,9 +1512,8 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]: + ) -> NextSentencePredictorOutput: r""" labels (`torch.LongTensor` of shape 
`(batch_size,)`, *optional*): Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair @@ -1585,9 +1551,7 @@ def forward( ) labels = kwargs.pop("next_sentence_label") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1596,11 +1560,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output seq_relationship_scores = self.cls(pooled_output) next_sentence_loss = None @@ -1608,10 +1570,6 @@ def forward( loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) - if not return_dict: - output = (seq_relationship_scores,) + outputs[2:] - return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output - return NextSentencePredictorOutput( loss=next_sentence_loss, logits=seq_relationship_scores, @@ -1643,6 +1601,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, @@ -1662,17 +1621,14 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1681,11 +1637,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1711,9 +1665,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, @@ -1744,6 +1695,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1761,15 +1713,13 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1782,7 +1732,7 @@ def forward( else None ) - outputs = self.bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1791,11 +1741,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) @@ -1805,10 +1753,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1839,6 +1783,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, @@ -1858,15 +1803,12 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> 
TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1875,11 +1817,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1888,10 +1828,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1918,6 +1854,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_QA, @@ -1940,8 +1877,7 @@ def forward( end_positions: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1952,9 +1888,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1963,11 +1897,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1990,10 +1922,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 75fd1b17168e..ea1c1843de97 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -15,7 +15,7 @@ """PyTorch BERT model specific for generation.""" import math -from typing import Optional, Tuple, Union +from typing import Optional, Tuple import torch import torch.utils.checkpoint @@ -30,6 +30,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -367,7 +368,7 @@ def feed_forward_chunk(self, attention_output): return layer_output -# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->BertGeneration +# Copied from transformers.models.bert.modeling_bert.BertEncoder with BertLayer->BertGenerationLayer class BertEncoder(nn.Module): def __init__(self, config): super().__init__() @@ -375,6 +376,7 @@ def __init__(self, config): self.layer = nn.ModuleList([BertGenerationLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -386,8 +388,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -440,18 +441,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -715,6 +704,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 
@add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -734,9 +724,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, # NOOP kwargs, for now - ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -759,7 +748,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -814,7 +802,7 @@ def forward( past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -824,20 +812,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] - if not return_dict: - return (sequence_output,) + encoder_outputs[1:] - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=sequence_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - ) + return outputs class BertGenerationOnlyLMHead(nn.Module): @@ -886,6 +863,7 @@ def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings self.lm_head.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -902,9 +880,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -949,11 +926,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.bert( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.bert( input_ids, attention_mask=attention_mask, position_ids=position_ids, @@ -965,11 +941,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, **kwargs, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) lm_loss = None @@ -981,10 +956,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[1:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 524b4caa7431..21bb7c9e09d2 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -37,6 +37,7 @@ from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -837,6 +838,7 @@ def __init__(self, config): self.layer = nn.ModuleList([BridgeTowerTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -848,8 +850,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -902,18 +903,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -1143,8 +1132,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1169,7 +1157,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1232,7 +1219,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1242,14 +1229,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1328,6 +1311,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.text_model.set_input_embeddings(value) + @can_return_tuple @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BridgeTowerModelOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1343,10 +1327,9 @@ def forward( image_token_type_idx: Optional[int] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, labels: Optional[torch.LongTensor] = None, interpolate_pos_encoding: bool = False, - ) -> Union[Tuple[torch.Tensor], BridgeTowerModelOutput]: + ) -> BridgeTowerModelOutput: r""" output_hidden_states (`bool`, *optional*): If set to `True`, hidden states are returned as a list containing the hidden states of text, image, and @@ -1393,7 +1376,6 @@ def forward( "BridgeTowerModel does not use `inputs_embeds`. Make sure to pass in `input_ids` instead." 
) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict image_token_type_idx = image_token_type_idx if image_token_type_idx else 1 input_shape = input_ids.size() text_embeds = self.text_model.embeddings(input_ids=input_ids) @@ -1547,13 +1529,6 @@ def forward( if output_hidden_states: all_hidden_states = (all_hidden_states_text, all_hidden_states_image, all_hidden_states_cross) - if not return_dict: - return tuple( - v - for v in [text_features, image_features, cls_features, all_hidden_states, all_self_attentions] - if v is not None - ) - return BridgeTowerModelOutput( text_features=text_features, image_features=image_features, @@ -1636,6 +1611,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.mlm_score.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1650,7 +1626,6 @@ def forward( image_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, labels: Optional[torch.LongTensor] = None, ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]: r""" @@ -1685,8 +1660,7 @@ def forward( >>> print(results) .a cat looking out of the window. ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.bridgetower( + outputs: BridgeTowerModelOutput = self.bridgetower( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1697,10 +1671,9 @@ def forward( image_embeds=image_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - mlm_logits = self.mlm_score(outputs.text_features if return_dict else outputs[0]) + mlm_logits = self.mlm_score(outputs.text_features) masked_lm_loss = None if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token @@ -1708,10 +1681,6 @@ def forward( labels = labels.to(mlm_logits.device) masked_lm_loss = loss_fct(mlm_logits.view(-1, self.config.text_config.vocab_size), labels.view(-1)) - if not return_dict: - output = tuple(mlm_logits) - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=mlm_logits, @@ -1738,6 +1707,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1752,7 +1722,6 @@ def forward( image_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, labels: Optional[torch.LongTensor] = None, ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]: r""" @@ -1783,9 +1752,8 @@ def forward( ... outputs = model(**encoding) ... 
scores[text] = outputs.logits[0, 1].item() ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.bridgetower( + outputs: BridgeTowerModelOutput = self.bridgetower( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1796,11 +1764,9 @@ def forward( image_embeds=image_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooler_output = outputs.pooler_output if return_dict else outputs[2] - + pooler_output = outputs.pooler_output logits = self.itm_score(pooler_output) itm_loss = None @@ -1810,10 +1776,6 @@ def forward( labels = labels.to(logits.device) itm_loss = loss_fct(logits, labels) - if not return_dict: - output = tuple(logits) - return ((itm_loss,) + output) if itm_loss is not None else output - return SequenceClassifierOutput( loss=itm_loss, logits=logits, @@ -1852,6 +1814,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BridgeTowerContrastiveOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1866,7 +1829,6 @@ def forward( image_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = True, - return_dict: Optional[bool] = None, return_loss: Optional[bool] = None, ) -> Union[BridgeTowerContrastiveOutput, Tuple[torch.FloatTensor]]: r""" @@ -1904,9 +1866,8 @@ def forward( >>> print("Loss with swapped images", round(loss_swapped.item(), 4)) Loss with swapped images 2.126 ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.bridgetower( + outputs: BridgeTowerModelOutput = self.bridgetower( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1917,13 +1878,9 @@ def forward( image_embeds=image_embeds, output_attentions=output_attentions, output_hidden_states=True, - return_dict=return_dict, - ) - - pooler_output = outputs.pooler_output if return_dict else outputs[2] - hidden_states_txt, hidden_states_img, hidden_states_cross_modal = ( - outputs.hidden_states if return_dict else outputs[3] ) + pooler_output = outputs.pooler_output + hidden_states_txt, hidden_states_img, hidden_states_cross_modal = outputs.hidden_states text_embeds = hidden_states_txt[-1] image_embeds = hidden_states_img[-1] @@ -1960,10 +1917,6 @@ def forward( image_to_cross_loss = nn.functional.cross_entropy(logits_image_to_cross, labels) itc_loss = (text_to_image_loss + text_to_cross_loss + image_to_cross_loss) / 3.0 - if not return_dict: - output = (logits, text_embeds, image_embeds, cross_embeds) + outputs[3:] - return ((itc_loss,) + output) if itc_loss is not None else output - return BridgeTowerContrastiveOutput( loss=itc_loss, logits=logits, diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index b69590ae21a5..0678a3cf7c26 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -16,7 +16,7 @@ """PyTorch CamemBERT model.""" import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -46,6 +46,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, 
get_torch_version, logging, replace_return_docstrings, @@ -602,6 +603,7 @@ def __init__(self, config): self.layer = nn.ModuleList([CamembertLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -613,8 +615,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -667,18 +668,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -913,8 +902,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -939,7 +927,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1034,7 +1021,7 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1044,14 +1031,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1091,6 +1074,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1113,8 +1097,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, 
output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1123,9 +1106,7 @@ def forward( kwargs (`Dict[str, any]`, *optional*, defaults to `{}`): Used to hide legacy arguments that have been deprecated. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1136,9 +1117,8 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) masked_lm_loss = None @@ -1148,10 +1128,6 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1180,6 +1156,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="cardiffnlp/twitter-roberta-base-emotion", @@ -1199,17 +1176,14 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1218,9 +1192,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.classifier(sequence_output) loss = None @@ -1248,10 +1221,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, @@ -1279,6 +1248,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @@ -1298,15 +1268,13 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1319,7 +1287,7 @@ def forward( else None ) - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, @@ -1328,9 +1296,8 @@ def forward( inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1343,10 +1310,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1378,6 +1341,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="Jean-Baptiste/roberta-large-ner-english", @@ -1397,15 +1361,12 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the 
token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1414,11 +1375,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1429,10 +1388,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1460,6 +1415,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="deepset/roberta-base-squad2", @@ -1480,8 +1436,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1492,9 +1447,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1503,10 +1456,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -1530,10 +1481,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -1568,6 +1515,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -1585,9 +1533,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1631,11 +1578,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1648,10 +1594,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) lm_loss = None @@ -1665,10 +1610,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 647e8f1c2421..b4fffdc74e97 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -16,7 +16,7 @@ import math from dataclasses import dataclass -from typing import Any, List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple import torch import torch.utils.checkpoint @@ -36,6 +36,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -924,6 +925,7 @@ def __init__(self, config): self.layer = nn.ModuleList([ChineseCLIPTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -935,8 +937,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -989,18 +990,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -1025,13 +1014,13 @@ def __init__(self, config: ChineseCLIPConfig): self.layers = nn.ModuleList([ChineseCLIPVisionLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -1051,7 +1040,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not 
None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -1080,8 +1068,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -1098,6 +1084,7 @@ def __init__(self, config: ChineseCLIPVisionConfig): self.encoder = ChineseCLIPVisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple @add_start_docstrings_to_model_forward(CHINESE_CLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ChineseCLIPVisionConfig) def forward( @@ -1106,8 +1093,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: """ @@ -1115,7 +1101,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1123,20 +1108,16 @@ def forward( hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -1191,6 +1172,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(CHINESE_CLIP_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1211,8 +1193,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1237,7 +1218,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1300,7 +1280,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1310,14 +1290,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1346,6 +1322,7 @@ def __init__(self, config: ChineseCLIPVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @can_return_tuple @add_start_docstrings_to_model_forward(CHINESE_CLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ChineseCLIPVisionConfig) def forward( @@ -1354,8 +1331,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1378,14 +1354,11 @@ def forward( >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - return self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) @@ -1425,6 +1398,7 @@ def __init__(self, config: ChineseCLIPConfig): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(CHINESE_CLIP_TEXT_INPUTS_DOCSTRING) def get_text_features( self, @@ -1432,9 +1406,6 @@ def get_text_features( attention_mask: Optional[torch.Tensor] = None, token_type_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1453,36 +1424,24 @@ def get_text_features( >>> text_features = model.get_text_features(**inputs) >>> text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True) ```""" - # Use CHINESE_CLIP model's config for some fields (if specified) instead of those of vision & text components. 
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, ) - - pooled_output = text_outputs[0][:, 0, :] + pooled_output = text_outputs.last_hidden_state[:, 0, :] text_features = self.text_projection(pooled_output) - return text_features + @can_return_tuple @add_start_docstrings_to_model_forward(CHINESE_CLIP_VISION_INPUTS_DOCSTRING) def get_image_features( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1507,26 +1466,17 @@ def get_image_features( >>> image_features = model.get_image_features(**inputs) >>> image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True) ```""" - # Use CHINESE_CLIP model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - - pooled_output = vision_outputs[1] # pooled_output + pooled_output = vision_outputs.pooler_output image_features = self.visual_projection(pooled_output) - return image_features + @can_return_tuple @add_start_docstrings_to_model_forward(CHINESE_CLIP_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=ChineseCLIPOutput, config_class=ChineseCLIPConfig) def forward( @@ -1540,8 +1490,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, ChineseCLIPOutput]: + ) -> ChineseCLIPOutput: r""" Returns: @@ -1569,30 +1518,27 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions 
= self.text_model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - image_embeds = vision_outputs[1] + image_embeds = vision_outputs.pooler_output image_embeds = self.visual_projection(image_embeds) - text_embeds = text_outputs[0][:, 0, :] + text_embeds = text_outputs.last_hidden_state[:, 0, :] text_embeds = self.text_projection(text_embeds) # normalized features @@ -1608,14 +1554,6 @@ def forward( if return_loss: loss = chinese_clip_loss(logits_per_text) - if not return_dict: - # fix the None pooled_output of text_outputs to conform with dict_output - pooled_output = text_outputs[1] - if pooled_output is None: - text_outputs = (text_outputs[0],) + text_outputs[2:] - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - return ChineseCLIPOutput( loss=loss, logits_per_image=logits_per_image, diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index a7a51cc86af3..99e6ac397812 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -35,6 +35,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -892,6 +893,7 @@ def reshape_mel2img(self, normalized_input_features): return normalized_input_features + @can_return_tuple def forward( self, input_features, @@ -901,8 +903,7 @@ def forward( output_hidden_states: Optional[bool] = False, output_hidden_states_before_downsampling: Optional[bool] = False, always_partition: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple, ClapAudioModelOutput]: + ) -> BaseModelOutputWithPooling: input_features = input_features.transpose(1, 3) normalized_input_features = self.batch_norm(input_features) normalized_input_features = normalized_input_features.transpose(1, 3) @@ -997,18 +998,6 @@ def forward( latent_output = self.avgpool(torch.flatten(last_hidden_state, 2)) latent_output = torch.flatten(latent_output, 1) - if not return_dict: - return tuple( - v - for v in [ - last_hidden_state, - latent_output, - all_reshaped_hidden_states, - all_self_attentions, - ] - if v is not None - ) - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=latent_output, @@ -1561,6 +1550,7 @@ def __init__(self, config): self.layer = nn.ModuleList([ClapTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -1572,8 +1562,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -1626,18 +1615,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - 
all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -1709,6 +1686,7 @@ def __init__(self, config: ClapAudioConfig): def get_input_embeddings(self) -> nn.Module: return self.audio_encoder.patch_embed.proj + @can_return_tuple @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=ClapAudioConfig) def forward( @@ -1717,8 +1695,7 @@ def forward( is_longer: Optional[torch.BoolTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1739,7 +1716,6 @@ def forward( >>> outputs = model(**inputs) >>> last_hidden_state = outputs.last_hidden_state ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -1750,7 +1726,6 @@ def forward( is_longer=is_longer, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) @@ -1790,6 +1765,7 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.embeddings.word_embeddings = value + @can_return_tuple def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1804,8 +1780,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1830,7 +1805,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1893,7 +1867,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1903,14 +1877,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1963,10 +1933,7 @@ def get_text_features( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Returns: text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by @@ -1983,23 +1950,15 @@ def get_text_features( >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) ```""" - # Use CLAP model's config for some fields (if specified) instead of those of audio & text components. 
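[Editor's note, not part of the diff] The `output_attentions`, `output_hidden_states`, and `return_dict` arguments removed from `get_text_features` / `get_audio_features` never changed what these helpers returned (a single projected tensor). Code that does want the tower's intermediate states can call the text model directly and apply the projection itself; a sketch using the `laion/clap-htsat-unfused` checkpoint already referenced in the docstrings above:

import torch
import torch.nn.functional as F
from transformers import ClapModel, ClapProcessor

model = ClapModel.from_pretrained("laion/clap-htsat-unfused").eval()
processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")
inputs = processor(text=["the sound of a cat"], padding=True, return_tensors="pt")

with torch.no_grad():
    text_outputs = model.text_model(**inputs, output_hidden_states=True)
    # the same projection + normalisation that get_text_features applies to pooler_output
    text_features = F.normalize(model.text_projection(text_outputs.pooler_output), dim=-1)
    hidden_states = text_outputs.hidden_states   # per-layer states, still available from the tower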
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, ) - pooled_output = text_outputs[1] if return_dict is not None else text_outputs.pooler_output + pooled_output = text_outputs.pooler_output text_features = self.text_projection(pooled_output) text_features = F.normalize(text_features, dim=-1) @@ -2010,11 +1969,7 @@ def get_audio_features( self, input_features: Optional[torch.Tensor] = None, is_longer: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Returns: audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The audio embeddings obtained by @@ -2032,25 +1987,19 @@ def get_audio_features( >>> inputs = feature_extractor(random_audio, return_tensors="pt") >>> audio_features = model.get_audio_features(**inputs) ```""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - audio_outputs = self.audio_model( + audio_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.audio_model( input_features=input_features, is_longer=is_longer, - return_dict=return_dict, ) - pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output - + pooled_output = audio_outputs.pooler_output audio_features = self.audio_projection(pooled_output) audio_features = F.normalize(audio_features, dim=-1) return audio_features + @can_return_tuple @add_start_docstrings_to_model_forward(CLAP_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=ClapOutput, config_class=ClapConfig) def forward( @@ -2063,7 +2012,6 @@ def forward( return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> Union[Tuple, ClapOutput]: r""" Returns: @@ -2093,29 +2041,26 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - audio_outputs = self.audio_model( + audio_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.audio_model( input_features=input_features, is_longer=is_longer, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, 
output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - audio_embeds = audio_outputs[1] if not return_dict else audio_outputs.pooler_output + audio_embeds = audio_outputs.pooler_output audio_embeds = self.audio_projection(audio_embeds) - text_embeds = text_outputs[1] if not return_dict else text_outputs.pooler_output + text_embeds = text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) # normalized features @@ -2134,10 +2079,6 @@ def forward( audio_loss = contrastive_loss(logits_per_audio.t()) loss = (caption_loss + audio_loss) / 2.0 - if not return_dict: - output = (logits_per_audio, logits_per_text, text_embeds, audio_embeds, text_outputs, audio_outputs) - return ((loss,) + output) if loss is not None else output - return ClapOutput( loss=loss, logits_per_audio=logits_per_audio, @@ -2171,6 +2112,7 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.text_model.embeddings.word_embeddings = value + @can_return_tuple @add_start_docstrings_to_model_forward(CLAP_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=ClapTextModelOutput, config_class=ClapTextConfig) def forward( @@ -2180,8 +2122,7 @@ def forward( position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, ClapTextModelOutput]: + ) -> ClapTextModelOutput: r""" Returns: @@ -2198,25 +2139,17 @@ def forward( >>> outputs = model(**inputs) >>> text_embeds = outputs.text_embeds ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - - pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output - + pooled_output = text_outputs.pooler_output text_embeds = self.text_projection(pooled_output) - if not return_dict: - outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] - return tuple(output for output in outputs if output is not None) - return ClapTextModelOutput( text_embeds=text_embeds, last_hidden_state=text_outputs.last_hidden_state, @@ -2245,6 +2178,7 @@ def __init__(self, config: ClapAudioConfig): def get_input_embeddings(self) -> nn.Module: return self.audio_model.audio_encoder.patch_embed.proj + @can_return_tuple @add_start_docstrings_to_model_forward(CLAP_AUDIO_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=ClapAudioModelOutput, config_class=ClapAudioConfig) def forward( @@ -2253,8 +2187,7 @@ def forward( is_longer: Optional[torch.BoolTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, ClapAudioModelOutput]: + ) -> ClapAudioModelOutput: r""" Returns: @@ -2274,28 +2207,21 @@ def forward( >>> outputs = model(**inputs) >>> audio_embeds = outputs.audio_embeds ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - audio_outputs = self.audio_model( + audio_outputs: 
BaseModelOutputWithPoolingAndCrossAttentions = self.audio_model( input_features=input_features, is_longer=is_longer, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output - + pooled_output = audio_outputs.pooler_output audio_embeds = self.audio_projection(pooled_output) - if not return_dict: - outputs = (audio_embeds, audio_outputs[0]) + audio_outputs[2:] - return tuple(output for output in outputs if output is not None) - return ClapAudioModelOutput( audio_embeds=audio_embeds, last_hidden_state=audio_outputs.last_hidden_state, diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index d626914d72f0..985e638ff29f 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -31,6 +31,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -602,6 +603,7 @@ def __init__(self, config: CLIPSegConfig): self.layers = nn.ModuleList([CLIPSegEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -609,8 +611,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -644,7 +645,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -677,8 +677,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -696,6 +694,7 @@ def __init__(self, config: CLIPSegTextConfig): # For `pooled_output` computation self.eos_token_id = config.eos_token_id + @can_return_tuple @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig) # Adapted from transformers.models.clip.modeling_clip.CLIPTextTransformer.forward with clip->clipseg, CLIP->CLIPSeg @@ -706,8 +705,7 @@ def forward( position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -716,7 +714,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is None: raise ValueError("You have to specify input_ids") @@ -736,16 +733,15 @@ def forward( # [bsz, seq_len] -> [bsz, 1, 
tgt_seq_len, src_seq_len] attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = self.final_layer_norm(last_hidden_state) if self.eos_token_id == 2: @@ -770,9 +766,6 @@ def forward( .argmax(dim=-1), ] - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -798,6 +791,7 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value + @can_return_tuple @add_start_docstrings_to_model_forward(CLIPSEG_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegTextConfig) def forward( @@ -807,8 +801,7 @@ def forward( position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -832,7 +825,6 @@ def forward( position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) @@ -848,6 +840,7 @@ def __init__(self, config: CLIPSegVisionConfig): self.encoder = CLIPSegEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig) def forward( @@ -855,7 +848,6 @@ def forward( pixel_values: Optional[torch.FloatTensor], output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, interpolate_pos_encoding: Optional[bool] = True, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" @@ -866,25 +858,20 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -906,6 +893,7 @@ def __init__(self, config: CLIPSegVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @can_return_tuple 
@add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=CLIPSegVisionConfig) def forward( @@ -914,8 +902,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: Optional[bool] = True, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -943,7 +930,6 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) @@ -989,10 +975,7 @@ def get_text_features( input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Returns: text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by @@ -1009,36 +992,23 @@ def get_text_features( >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") >>> text_features = model.get_text_features(**inputs) ```""" - # Use CLIPSEG model's config for some fields (if specified) instead of those of vision & text components. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + output_attentions=False, + output_hidden_states=False, ) - - pooled_output = text_outputs[1] + pooled_output = text_outputs.pooler_output text_features = self.text_projection(pooled_output) - return text_features @add_start_docstrings_to_model_forward(CLIPSEG_VISION_INPUTS_DOCSTRING) def get_image_features( self, pixel_values: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = True, - return_dict: Optional[bool] = None, - ) -> torch.FloatTensor: + ) -> torch.Tensor: r""" Returns: image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by @@ -1061,26 +1031,17 @@ def get_image_features( >>> image_features = model.get_image_features(**inputs) ```""" - # Use CLIPSEG model's config for some fields (if specified) instead of those of vision & text components. 
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + output_attentions=False, + output_hidden_states=False, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - - pooled_output = vision_outputs[1] # pooled_output + pooled_output = vision_outputs.pooler_output image_features = self.visual_projection(pooled_output) - return image_features + @can_return_tuple @add_start_docstrings_to_model_forward(CLIPSEG_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CLIPSegOutput, config_class=CLIPSegConfig) def forward( @@ -1093,8 +1054,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = True, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CLIPSegOutput]: + ) -> CLIPSegOutput: r""" Returns: @@ -1124,29 +1084,26 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - image_embeds = vision_outputs[1] + image_embeds = vision_outputs.pooler_output image_embeds = self.visual_projection(image_embeds) - text_embeds = text_outputs[1] + text_embeds = text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) # normalized features @@ -1162,10 +1119,6 @@ def forward( if return_loss: loss = clipseg_loss(logits_per_text) - if not return_dict: - output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - return CLIPSegOutput( loss=loss, logits_per_image=logits_per_image, @@ -1277,13 +1230,13 @@ def __init__(self, config: CLIPSegConfig): decoder_config.hidden_act = "relu" self.layers = nn.ModuleList([CLIPSegDecoderLayer(decoder_config) for _ in range(len(config.extract_layers))]) + @can_return_tuple def forward( self, hidden_states: Tuple[torch.Tensor], conditional_embeddings: torch.Tensor, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = True, ): all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -1324,9 +1277,6 @@ def forward( logits = self.transposed_convolution(output).squeeze(1) - if not return_dict: - return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None) - return CLIPSegDecoderOutput( 
logits=logits, hidden_states=all_hidden_states, @@ -1359,10 +1309,10 @@ def __init__(self, config: CLIPSegConfig): def get_conditional_embeddings( self, batch_size: Optional[int] = None, - input_ids: Optional[torch.Tensor] = None, + input_ids: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, - conditional_pixel_values: Optional[torch.Tensor] = None, + conditional_pixel_values: Optional[torch.FloatTensor] = None, ): if input_ids is not None: # compute conditional embeddings from texts @@ -1385,6 +1335,7 @@ def get_conditional_embeddings( return conditional_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(CLIPSEG_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CLIPSegImageSegmentationOutput, config_class=CLIPSegTextConfig) def forward( @@ -1399,8 +1350,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = True, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CLIPSegOutput]: + ) -> CLIPSegImageSegmentationOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -1430,36 +1380,21 @@ def forward( >>> print(logits.shape) torch.Size([3, 352, 352]) ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict # step 1: forward the query images through the frozen CLIP vision encoder with torch.no_grad(): - vision_outputs = self.clip.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.clip.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=True, # we need the intermediate hidden states interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) pooled_output = self.clip.visual_projection(vision_outputs[1]) - hidden_states = vision_outputs.hidden_states if return_dict else vision_outputs[2] + hidden_states = vision_outputs.hidden_states # we add +1 here as the hidden states also include the initial embeddings activations = [hidden_states[i + 1] for i in self.extract_layers] - # update vision_outputs - if return_dict: - vision_outputs = BaseModelOutputWithPooling( - last_hidden_state=vision_outputs.last_hidden_state, - pooler_output=vision_outputs.pooler_output, - hidden_states=vision_outputs.hidden_states if output_hidden_states else None, - attentions=vision_outputs.attentions, - ) - else: - vision_outputs = ( - vision_outputs[:2] + vision_outputs[3:] if not output_hidden_states else vision_outputs - ) - # step 2: compute conditional embeddings, either from text, images or an own provided embedding if conditional_embeddings is None: conditional_embeddings = self.get_conditional_embeddings( @@ -1481,14 +1416,13 @@ def forward( ) # step 3: forward both the pooled output and the activations through the lightweight decoder to predict masks - decoder_outputs = self.decoder( + decoder_outputs: CLIPSegDecoderOutput = self.decoder( activations, conditional_embeddings, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - logits = decoder_outputs.logits if return_dict else decoder_outputs[0] + logits = decoder_outputs.logits loss = None if labels is not None: @@ -1497,10 +1431,6 @@ def forward( loss_fn = nn.BCEWithLogitsLoss() loss = loss_fn(logits, labels) - if not return_dict: - output = (logits, 
conditional_embeddings, pooled_output, vision_outputs, decoder_outputs) - return ((loss,) + output) if loss is not None else output - return CLIPSegImageSegmentationOutput( loss=loss, logits=logits, diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 5f84eca754e8..321d56f97057 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -15,7 +15,7 @@ """PyTorch Data2VecText model.""" import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -40,6 +40,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -479,6 +480,7 @@ def __init__(self, config): self.layer = nn.ModuleList([Data2VecTextLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -490,8 +492,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -544,18 +545,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -726,6 +715,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -747,8 +737,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -773,7 +762,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -836,7 +824,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -846,14 +834,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -888,6 +872,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -905,9 +890,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -951,11 +935,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.data2vec_text( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -968,10 +951,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) lm_loss = None @@ -983,10 +965,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1030,6 +1008,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1050,8 +1029,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1060,9 +1038,7 @@ def forward( kwargs (`Dict[str, any]`, *optional*, defaults to *{}*): Used to hide legacy arguments that have been deprecated. 
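With `@can_return_tuple` on the masked-LM forward, callers get a `MaskedLMOutput` by default, and the old tuple form is still reachable because the decorator intercepts `return_dict=False` before the body runs. A minimal usage sketch of that contract; the `facebook/data2vec-text-base` checkpoint and the example sentence are assumptions for illustration, not taken from this diff:

```python
import torch
from transformers import AutoTokenizer, Data2VecTextForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")  # assumed checkpoint
model = Data2VecTextForMaskedLM.from_pretrained("facebook/data2vec-text-base")

inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")

# Default call: the decorated forward returns a MaskedLMOutput dataclass.
outputs = model(**inputs)
print(type(outputs).__name__, outputs.logits.shape)  # MaskedLMOutput, (1, seq_len, vocab_size)

# Legacy behaviour is preserved by @can_return_tuple: return_dict=False is intercepted
# by the decorator and the ModelOutput is converted to a plain tuple.
tuple_outputs = model(**inputs, return_dict=False)
torch.testing.assert_close(tuple_outputs[0], outputs.logits)
```

The same tuple fallback applies to every forward decorated with `@can_return_tuple` in this patch.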
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.data2vec_text( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1073,9 +1049,8 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) masked_lm_loss = None @@ -1085,10 +1060,6 @@ def forward( labels = labels.to(prediction_scores.device) masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1148,6 +1119,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1165,17 +1137,14 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.data2vec_text( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1184,9 +1153,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.classifier(sequence_output) loss = None @@ -1214,10 +1182,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, @@ -1244,6 +1208,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @@ -1263,15 +1228,13 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1284,7 +1247,7 @@ def forward( else None ) - outputs = self.data2vec_text( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.data2vec_text( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, @@ -1293,9 +1256,8 @@ def forward( inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1308,10 +1270,6 @@ def forward( labels = labels.to(reshaped_logits.device) loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1342,6 +1300,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1359,15 +1318,12 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.data2vec_text( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1376,10 +1332,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1391,10 +1346,6 @@ def forward( labels = labels.to(logits.device) loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1444,6 +1395,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(DATA2VECTEXT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1462,8 +1414,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1474,9 +1425,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.data2vec_text( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.data2vec_text( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1485,10 +1434,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -1512,10 +1460,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 7b73f022122d..c5a48b8c945e 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -17,7 +17,7 @@ import math import os from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -43,6 +43,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -536,6 +537,7 @@ def __init__(self, config): self.layer = nn.ModuleList([ElectraLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -547,8 +549,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -601,18 +602,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -827,6 +816,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -847,13 +837,11 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: output_attentions = output_attentions if output_attentions is not None else 
self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -907,7 +895,7 @@ def forward( if hasattr(self, "embeddings_project"): hidden_states = self.embeddings_project(hidden_states) - hidden_states = self.encoder( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( hidden_states, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -917,10 +905,8 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - - return hidden_states + return outputs class ElectraClassificationHead(nn.Module): @@ -964,6 +950,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="bhadresh-savani/electra-base-emotion", @@ -983,17 +970,15 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
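The head logic above is untouched; only the return plumbing changes, so downstream code can rely on attribute access. A short sketch using the `bhadresh-savani/electra-base-emotion` checkpoint named in the `add_code_sample_docstrings` decorator above; the input sentence and the label round-trip are illustrative assumptions:

```python
import torch
from transformers import AutoTokenizer, ElectraForSequenceClassification

checkpoint = "bhadresh-savani/electra-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = ElectraForSequenceClassification.from_pretrained(checkpoint)

inputs = tokenizer("I love this library!", return_tensors="pt")
outputs = model(**inputs)  # SequenceClassifierOutput

# The classification head sits on top of last_hidden_state; logits shape: (batch_size, num_labels)
predicted = outputs.logits.argmax(-1).item()
print(model.config.id2label[predicted])

# Passing integer labels routes through the single-label cross-entropy branch and fills .loss
labeled = model(**inputs, labels=torch.tensor([predicted]))
print(labeled.loss.item())
```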
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - discriminator_hidden_states = self.electra( + discriminator_outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1002,10 +987,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = discriminator_hidden_states[0] + sequence_output = discriminator_outputs.last_hidden_state logits = self.classifier(sequence_output) loss = None @@ -1031,15 +1015,11 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + discriminator_hidden_states[1:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, - hidden_states=discriminator_hidden_states.hidden_states, - attentions=discriminator_hidden_states.attentions, + hidden_states=discriminator_outputs.hidden_states, + attentions=discriminator_outputs.attentions, ) @@ -1060,6 +1040,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=ElectraForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1073,8 +1054,7 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], ElectraForPreTrainingOutput]: + ) -> ElectraForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the ELECTRA loss. 
Input should be a sequence of tokens (see `input_ids` docstring) @@ -1108,9 +1088,7 @@ def forward( >>> predictions.squeeze().tolist() [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0] ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - discriminator_hidden_states = self.electra( + discriminator_outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1119,32 +1097,26 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - discriminator_sequence_output = discriminator_hidden_states[0] - - logits = self.discriminator_predictions(discriminator_sequence_output) + sequence_output = discriminator_outputs.last_hidden_state + logits = self.discriminator_predictions(sequence_output) loss = None if labels is not None: loss_fct = nn.BCEWithLogitsLoss() if attention_mask is not None: - active_loss = attention_mask.view(-1, discriminator_sequence_output.shape[1]) == 1 - active_logits = logits.view(-1, discriminator_sequence_output.shape[1])[active_loss] + active_loss = attention_mask.view(-1, sequence_output.shape[1]) == 1 + active_logits = logits.view(-1, sequence_output.shape[1])[active_loss] active_labels = labels[active_loss] loss = loss_fct(active_logits, active_labels.float()) else: - loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float()) - - if not return_dict: - output = (logits,) + discriminator_hidden_states[1:] - return ((loss,) + output) if loss is not None else output + loss = loss_fct(logits.view(-1, sequence_output.shape[1]), labels.float()) return ElectraForPreTrainingOutput( loss=loss, logits=logits, - hidden_states=discriminator_hidden_states.hidden_states, - attentions=discriminator_hidden_states.attentions, + hidden_states=discriminator_outputs.hidden_states, + attentions=discriminator_outputs.attentions, ) @@ -1176,6 +1148,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, word_embeddings): self.generator_lm_head = word_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="google/electra-small-generator", @@ -1196,17 +1169,14 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - generator_hidden_states = self.electra( + generator_outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1215,11 +1185,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - generator_sequence_output = generator_hidden_states[0] + sequence_output = generator_outputs.last_hidden_state - prediction_scores = self.generator_predictions(generator_sequence_output) + prediction_scores = self.generator_predictions(sequence_output) prediction_scores = self.generator_lm_head(prediction_scores) loss = None @@ -1228,15 +1197,11 @@ def forward( loss_fct = nn.CrossEntropyLoss() # -100 index = padding token loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + generator_hidden_states[1:] - return ((loss,) + output) if loss is not None else output - return MaskedLMOutput( loss=loss, logits=prediction_scores, - hidden_states=generator_hidden_states.hidden_states, - attentions=generator_hidden_states.attentions, + hidden_states=generator_outputs.hidden_states, + attentions=generator_outputs.attentions, ) @@ -1262,6 +1227,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="bhadresh-savani/electra-base-discriminator-finetuned-conll03-english", @@ -1281,15 +1247,12 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
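Same pattern for the token-classification head, which now always returns a `TokenClassifierOutput` with per-token logits. A brief sketch using the CoNLL-03 checkpoint named in the decorator above; the example sentence is an assumption:

```python
from transformers import AutoTokenizer, ElectraForTokenClassification

checkpoint = "bhadresh-savani/electra-base-discriminator-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = ElectraForTokenClassification.from_pretrained(checkpoint)

inputs = tokenizer("HuggingFace is based in New York City", return_tensors="pt")
outputs = model(**inputs)  # TokenClassifierOutput

# One row of logits per token: (batch_size, sequence_length, num_labels)
tag_ids = outputs.logits.argmax(-1)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
print([(tok, model.config.id2label[i]) for tok, i in zip(tokens, tag_ids)])
```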
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - discriminator_hidden_states = self.electra( + discriminator_outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1298,10 +1261,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - discriminator_sequence_output = discriminator_hidden_states[0] - + discriminator_sequence_output = discriminator_outputs.last_hidden_state discriminator_sequence_output = self.dropout(discriminator_sequence_output) logits = self.classifier(discriminator_sequence_output) @@ -1310,15 +1271,11 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + discriminator_hidden_states[1:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, - hidden_states=discriminator_hidden_states.hidden_states, - attentions=discriminator_hidden_states.attentions, + hidden_states=discriminator_outputs.hidden_states, + attentions=discriminator_outputs.attentions, ) @@ -1343,6 +1300,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="bhadresh-savani/electra-base-squad2", @@ -1365,8 +1323,7 @@ def forward( end_positions: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1377,9 +1334,8 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - discriminator_hidden_states = self.electra( + discriminator_outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1390,8 +1346,7 @@ def forward( output_hidden_states=output_hidden_states, ) - sequence_output = discriminator_hidden_states[0] - + sequence_output = discriminator_outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1414,19 +1369,12 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = ( - start_logits, - end_logits, - ) + discriminator_hidden_states[1:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, - hidden_states=discriminator_hidden_states.hidden_states, - attentions=discriminator_hidden_states.attentions, + hidden_states=discriminator_outputs.hidden_states, + attentions=discriminator_outputs.attentions, ) @@ -1448,6 +1396,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1465,15 +1414,13 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1486,7 +1433,7 @@ def forward( else None ) - discriminator_hidden_states = self.electra( + discriminator_outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1495,11 +1442,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = discriminator_hidden_states[0] - + sequence_output = discriminator_outputs.last_hidden_state pooled_output = self.sequence_summary(sequence_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) @@ -1509,15 +1454,11 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + discriminator_hidden_states[1:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, - hidden_states=discriminator_hidden_states.hidden_states, - attentions=discriminator_hidden_states.attentions, + hidden_states=discriminator_outputs.hidden_states, + attentions=discriminator_outputs.attentions, ) @@ -1545,6 +1486,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.generator_lm_head = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ELECTRA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -1562,9 +1504,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1608,11 +1549,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.electra( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.electra( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1625,10 +1565,8 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.generator_lm_head(self.generator_predictions(sequence_output)) lm_loss = None @@ -1640,10 +1578,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[1:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 221dd5748588..cbf4459bfdcb 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -17,7 +17,7 @@ import math import warnings from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -44,6 +44,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -465,6 +466,7 @@ def __init__(self, config): self.layer = nn.ModuleList([ErnieLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -476,8 +478,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -530,18 +531,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -824,6 +813,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -845,8 +835,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` 
of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -871,7 +860,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -935,7 +923,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -945,14 +933,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -992,6 +976,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=ErnieForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1007,8 +992,7 @@ def forward( next_sentence_label: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], ErnieForPreTrainingOutput]: + ) -> ErnieForPreTrainingOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., @@ -1041,9 +1025,8 @@ def forward( >>> seq_relationship_logits = outputs.seq_relationship_logits ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1053,10 +1036,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output, pooled_output = outputs[:2] + sequence_output = outputs.last_hidden_state + pooled_output = outputs.pooler_output prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) total_loss = None @@ -1066,10 +1049,6 @@ def forward( next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return ErnieForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, @@ -1107,6 +1086,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1129,9 +1109,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1156,11 +1135,10 @@ def forward( If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). 
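`use_cache` still reaches the encoder, and the cache now always comes back as `past_key_values` on the `CausalLMOutputWithCrossAttentions`. A self-contained sketch of the caching path, built from a fresh `ErnieConfig` with random weights so it needs no checkpoint; the token ids are arbitrary and `is_decoder=True` is required for the causal head:

```python
import torch
from transformers import ErnieConfig, ErnieForCausalLM

# Random-weight model built from a fresh config, so no checkpoint is needed for the sketch.
config = ErnieConfig(is_decoder=True)
model = ErnieForCausalLM(config).eval()

input_ids = torch.tensor([[101, 7592, 2088, 102]])  # arbitrary token ids
out = model(input_ids, use_cache=True)  # CausalLMOutputWithCrossAttentions

# With use_cache=True the cache always comes back on the output object,
# so the next step only needs to feed the newly sampled token.
next_token = out.logits[:, -1].argmax(-1, keepdim=True)
step = model(next_token, past_key_values=out.past_key_values, use_cache=True)
print(step.logits.shape)  # (1, 1, config.vocab_size)
```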
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1174,10 +1152,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) lm_loss = None @@ -1189,10 +1166,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1241,6 +1214,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1263,8 +1237,7 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1272,9 +1245,7 @@ def forward( loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1286,10 +1257,9 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) masked_lm_loss = None @@ -1297,10 +1267,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1349,6 +1315,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1363,9 +1330,8 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]: + ) -> NextSentencePredictorOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the next 
sequence prediction (classification) loss. Input should be a sequence pair @@ -1403,9 +1369,7 @@ def forward( ) labels = kwargs.pop("next_sentence_label") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1415,11 +1379,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output seq_relationship_scores = self.cls(pooled_output) next_sentence_loss = None @@ -1427,10 +1389,6 @@ def forward( loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) - if not return_dict: - output = (seq_relationship_scores,) + outputs[2:] - return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output - return NextSentencePredictorOutput( loss=next_sentence_loss, logits=seq_relationship_scores, @@ -1463,6 +1421,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, @@ -1476,17 +1435,15 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
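Replacing `outputs[1]` with `outputs.pooler_output`, as these hunks do, is safe because `ModelOutput` supports both positional indexing and attribute access over its non-`None` fields. A small demonstration on dummy tensors:

```python
import torch
from transformers.modeling_outputs import BaseModelOutputWithPooling

out = BaseModelOutputWithPooling(
    last_hidden_state=torch.zeros(1, 4, 8),
    pooler_output=torch.zeros(1, 8),
)

assert out[0] is out.last_hidden_state
assert out[1] is out.pooler_output
assert len(out.to_tuple()) == 2  # hidden_states / attentions are None and dropped
```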
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1496,11 +1453,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1526,9 +1481,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, @@ -1560,6 +1512,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1578,15 +1531,13 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1599,7 +1550,7 @@ def forward( else None ) - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1609,10 +1560,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1623,10 +1573,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1658,6 +1604,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, @@ -1671,15 +1618,13 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for 
computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1689,7 +1634,6 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) sequence_output = outputs[0] @@ -1702,10 +1646,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1733,6 +1673,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, @@ -1747,8 +1688,7 @@ def forward( end_positions: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1759,9 +1699,8 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
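The span-loss arithmetic described here is untouched by the refactor; only the way `start_logits` and `end_logits` are returned changes. The same pattern reproduced on dummy tensors, to make the clamping behaviour concrete:

```python
# Out-of-range positions are clamped to `ignored_index` and then skipped
# by CrossEntropyLoss.
import torch
from torch.nn import CrossEntropyLoss

batch, seq_len = 2, 6
start_logits = torch.randn(batch, seq_len)
end_logits = torch.randn(batch, seq_len)
start_positions = torch.tensor([1, 99])  # 99 lies outside the sequence
end_positions = torch.tensor([3, 99])

ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
print(total_loss.item())
```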
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.ernie( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.ernie( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1771,11 +1710,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1798,10 +1735,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 047b3a1fc43e..7449f9146468 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -39,6 +39,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -404,6 +405,7 @@ def __init__(self, config): self.layer = nn.ModuleList([GitLayer(config, i) for i in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -414,8 +416,7 @@ def forward( output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, pixel_values_present: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPast]: + ) -> BaseModelOutputWithPast: if self.gradient_checkpointing and self.training: if use_cache: logger.warning_once( @@ -478,17 +479,6 @@ def forward( if return_legacy_cache: next_cache = next_cache.to_legacy_cache() - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_cache, - all_hidden_states, - all_self_attentions, - ] - if v is not None - ) return BaseModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=next_cache, @@ -882,6 +872,7 @@ def __init__(self, config: GitVisionConfig): self.layers = nn.ModuleList([GitVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -889,8 +880,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -924,7 +914,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -957,8 +946,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, 
encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -994,6 +981,7 @@ def __init__(self, config: GitVisionConfig): self.encoder = GitVisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple @add_start_docstrings_to_model_forward(GIT_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutput, config_class=GitVisionConfig) def forward( @@ -1002,8 +990,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: Optional[bool] = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Returns: @@ -1012,7 +999,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -1020,20 +1006,15 @@ def forward( hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPast = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] - + last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = self.post_layernorm(last_hidden_state) - if not return_dict: - return (last_hidden_state,) + encoder_outputs[1:] - return BaseModelOutput( last_hidden_state=last_hidden_state, hidden_states=encoder_outputs.hidden_states, @@ -1059,6 +1040,7 @@ def __init__(self, config: GitVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @can_return_tuple @add_start_docstrings_to_model_forward(GIT_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutput, config_class=GitVisionConfig) def forward( @@ -1067,8 +1049,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Returns: @@ -1090,14 +1071,12 @@ def forward( >>> outputs = model(**inputs) >>> last_hidden_state = outputs.last_hidden_state ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict return self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) @@ -1209,6 +1188,7 @@ def create_attention_mask(self, tgt, memory, tgt_mask, past_key_values_length, m return full_attention_mask + @can_return_tuple @add_start_docstrings_to_model_forward(GIT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC) def forward( @@ -1224,8 +1204,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], 
BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPast: r""" use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see @@ -1258,7 +1237,6 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -1359,7 +1337,7 @@ def forward( else: combined_attention_mask[:, :, -input_shape[1] :, -input_shape[1] :] += expanded_attn_mask - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPast = self.encoder( hidden_states, attention_mask=combined_attention_mask, head_mask=head_mask, @@ -1367,13 +1345,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, pixel_values_present=pixel_values is not None, ) - sequence_output = encoder_outputs[0] - - if not return_dict: - return (sequence_output,) + encoder_outputs[1:] + sequence_output = encoder_outputs.last_hidden_state return BaseModelOutputWithPast( last_hidden_state=sequence_output, @@ -1404,6 +1378,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.output = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(GIT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( @@ -1420,9 +1395,8 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]: + ) -> CausalLMOutputWithPast: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). 
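One practical reason to prefer attribute access over tuple indexing, visible in this GIT head: in the legacy tuple the position of `logits` depends on whether a loss was computed, while the field name never moves. Dummy tensors only:

```python
import torch
from transformers.modeling_outputs import CausalLMOutputWithPast

logits = torch.zeros(1, 5, 100)
loss = torch.tensor(0.5)

without_loss = CausalLMOutputWithPast(logits=logits)
with_loss = CausalLMOutputWithPast(loss=loss, logits=logits)

assert without_loss.to_tuple()[0] is logits   # index 0 -> logits
assert with_loss.to_tuple()[0] is loss        # index 0 -> loss
assert with_loss.logits is without_loss.logits  # names stay stable
```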
Indices should be in @@ -1559,11 +1533,10 @@ def forward( Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.'] ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.git( + outputs: BaseModelOutputWithPast = self.git( input_ids, attention_mask=attention_mask, position_ids=position_ids, @@ -1575,10 +1548,9 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.output(sequence_output) loss = None @@ -1594,10 +1566,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - return CausalLMOutputWithPast( loss=loss, logits=logits, diff --git a/src/transformers/models/idefics/vision.py b/src/transformers/models/idefics/vision.py index 5b2ef5ae3ee3..f23445665251 100644 --- a/src/transformers/models/idefics/vision.py +++ b/src/transformers/models/idefics/vision.py @@ -16,7 +16,7 @@ import math from dataclasses import dataclass -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Optional, Tuple import torch import torch.utils.checkpoint @@ -25,10 +25,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import ALL_ATTENTION_FUNCTIONS -from ...utils import ( - ModelOutput, - logging, -) +from ...utils import ModelOutput, can_return_tuple, logging from .configuration_idefics import IdeficsVisionConfig @@ -349,6 +346,7 @@ def __init__(self, config: IdeficsVisionConfig): self.layers = nn.ModuleList([IdeficsVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -356,8 +354,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -391,7 +388,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -424,8 +420,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -444,14 +438,14 @@ def __init__(self, config: IdeficsVisionConfig): self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) # Adapted from transformers.models.clip.modeling_clip.CLIPVisionTransformer.forward + @can_return_tuple def forward( self, pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: Optional[bool] = 
False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -460,7 +454,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -472,16 +465,12 @@ def forward( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, diff --git a/src/transformers/models/kosmos2/modeling_kosmos2.py b/src/transformers/models/kosmos2/modeling_kosmos2.py index e557293ee370..7cb6657b2cb3 100644 --- a/src/transformers/models/kosmos2/modeling_kosmos2.py +++ b/src/transformers/models/kosmos2/modeling_kosmos2.py @@ -16,7 +16,7 @@ import math from dataclasses import dataclass -from typing import Any, Callable, List, Optional, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple import torch import torch.utils.checkpoint @@ -36,6 +36,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -651,6 +652,7 @@ def __init__(self, config: Kosmos2VisionConfig): self.layers = nn.ModuleList([Kosmos2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -658,8 +660,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -693,7 +694,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -726,8 +726,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -746,19 +744,18 @@ def __init__(self, config: Kosmos2VisionConfig): self.encoder = Kosmos2VisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple def forward( self, pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else 
self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if pixel_values is None: raise ValueError("You have to specify pixel_values") @@ -766,20 +763,16 @@ def forward( hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layrnorm(hidden_states) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -1216,6 +1209,7 @@ def forward_embedding( return hidden_states + @can_return_tuple def forward( self, input_ids: Optional[torch.Tensor] = None, @@ -1232,14 +1226,12 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -1357,18 +1349,6 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - present_key_value_states, - all_hidden_states, - all_self_attns, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=present_key_value_states, @@ -1487,6 +1467,7 @@ def __init__(self, config: Kosmos2VisionConfig): def get_input_embeddings(self) -> nn.Module: return self.model.embeddings.patch_embedding + @can_return_tuple @add_start_docstrings_to_model_forward(KOSMOS2_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Kosmos2VisionConfig) def forward( @@ -1495,8 +1476,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1506,7 +1486,6 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) @@ -1525,6 +1504,7 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.model.embed_tokens = value + @can_return_tuple @add_start_docstrings_to_model_forward(KOSMOS2_TEXT_INPUTS_DOCSTRING) 
@replace_return_docstrings(output_type=BaseModelOutputWithPastAndCrossAttentions, config_class=Kosmos2TextConfig) def forward( @@ -1543,8 +1523,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" Returns: @@ -1564,7 +1543,6 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) @@ -1600,6 +1578,7 @@ def get_output_embeddings(self) -> nn.Module: def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(KOSMOS2_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=Kosmos2TextConfig) def forward( @@ -1619,8 +1598,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in @@ -1630,14 +1608,13 @@ def forward( Returns: """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: if use_cache: logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") use_cache = False - outputs = self.model( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.model( input_ids=input_ids, attention_mask=attention_mask, image_embeds=image_embeds, @@ -1652,9 +1629,8 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - lm_logits = self.lm_head(outputs[0]) + lm_logits = self.lm_head(outputs.last_hidden_state) loss = None if labels is not None: @@ -1670,10 +1646,6 @@ def forward( shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length) ) - if not return_dict: - output = (lm_logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=loss, logits=lm_logits, @@ -1804,6 +1776,7 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.text_model.model.embed_tokens = value + @can_return_tuple @add_start_docstrings_to_model_forward(KOSMOS2_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Kosmos2ModelOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1821,8 +1794,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, Kosmos2ModelOutput]: + ) -> Kosmos2ModelOutput: r""" Returns: @@ -1860,7 +1832,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_model_output = None projection_attentions = None @@ -1868,20 +1839,19 @@ def forward( if pixel_values is None: raise ValueError("You have to specify 
either `pixel_values` or `image_embeds`.") - vision_model_output = self.vision_model( + vision_model_output: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`. - image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0]) + image_embeds = self.vision_model.model.post_layernorm(vision_model_output.last_hidden_state) # normalized features image_embeds = nn.functional.normalize(image_embeds, dim=-1) image_embeds, projection_attentions = self.image_to_text_projection(image_embeds) - outputs = self.text_model( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, image_embeds=image_embeds, @@ -1893,13 +1863,8 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - if not return_dict: - outputs = outputs + (image_embeds, projection_attentions, vision_model_output) - return tuple(output for output in outputs if output is not None) - return Kosmos2ModelOutput( last_hidden_state=outputs.last_hidden_state, past_key_values=outputs.past_key_values, @@ -1946,6 +1911,7 @@ def get_output_embeddings(self) -> nn.Module: def set_output_embeddings(self, new_embeddings): self.text_model.set_output_embeddings(new_embeddings) + @can_return_tuple @add_start_docstrings_to_model_forward(KOSMOS2_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Kosmos2ForConditionalGenerationModelOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1963,8 +1929,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, Kosmos2ForConditionalGenerationModelOutput]: + ) -> Kosmos2ForConditionalGenerationModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in @@ -2015,7 +1980,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_model_output = None projection_attentions = None @@ -2023,19 +1987,17 @@ def forward( if pixel_values is None: raise ValueError("You have to specify either `pixel_values` or `image_embeds`.") - vision_model_output = self.vision_model( + vision_model_output: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`. 
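The pieces the removed `not return_dict` branch used to append to the tuple (the image embeddings, the projection attentions, the raw vision output) remain available as named fields on the output dataclass, as the `Kosmos2ModelOutput(...)` construction in this hunk indicates. The shapes below are dummy values and the unset fields are simply absent from the mapping view:

```python
# Dummy shapes, illustrative only; field names follow the construction above.
import torch
from transformers.models.kosmos2.modeling_kosmos2 import Kosmos2ModelOutput

out = Kosmos2ModelOutput(
    last_hidden_state=torch.zeros(1, 10, 16),
    image_embeds=torch.zeros(1, 64, 16),
)

print(out.image_embeds.shape)  # torch.Size([1, 64, 16])
print(list(out.keys()))        # only the fields that were actually set
```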
- image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0]) - # normalized features + image_embeds = self.vision_model.model.post_layernorm(vision_model_output.last_hidden_state) image_embeds = nn.functional.normalize(image_embeds, dim=-1) image_embeds, projection_attentions = self.image_to_text_projection(image_embeds) - lm_outputs = self.text_model( + lm_outputs: CausalLMOutputWithCrossAttentions = self.text_model( input_ids=input_ids, attention_mask=attention_mask, image_embeds=image_embeds, @@ -2048,13 +2010,8 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - if not return_dict: - outputs = lm_outputs + (image_embeds, projection_attentions, vision_model_output) - return tuple(output for output in outputs if output is not None) - return Kosmos2ForConditionalGenerationModelOutput( loss=lm_outputs.loss, logits=lm_outputs.logits, diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 8c31521a3f6d..5d48981925c2 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -15,7 +15,7 @@ """PyTorch LayoutLM model.""" import math -from typing import Optional, Tuple, Union +from typing import Optional, Tuple import torch import torch.utils.checkpoint @@ -33,7 +33,13 @@ ) from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + can_return_tuple, + logging, + replace_return_docstrings, +) from .configuration_layoutlm import LayoutLMConfig @@ -455,6 +461,7 @@ def __init__(self, config): self.layer = nn.ModuleList([LayoutLMLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -466,8 +473,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -520,18 +526,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -731,6 +725,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -746,8 +741,7 @@ def forward( 
encoder_attention_mask: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" Returns: @@ -786,7 +780,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -830,20 +823,16 @@ def forward( token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, extended_attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -876,6 +865,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -892,8 +882,7 @@ def forward( encoder_attention_mask: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
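As a reminder of what the `-100` convention in these labels means in practice: `CrossEntropyLoss` ignores those positions by default, so only the masked tokens are scored. Dummy logits, illustrative only:

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size, seq_len = 10, 4
prediction_scores = torch.randn(1, seq_len, vocab_size)
labels = torch.tensor([[-100, 3, -100, 7]])  # only positions 1 and 3 are scored

loss_fct = CrossEntropyLoss()  # ignore_index defaults to -100
loss = loss_fct(prediction_scores.view(-1, vocab_size), labels.view(-1))

# Same value as averaging the loss over just the two labelled positions:
manual = loss_fct(prediction_scores[0, [1, 3]], labels[0, [1, 3]])
assert torch.allclose(loss, manual)
```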
Indices should be in `[-100, 0, ..., @@ -939,9 +928,8 @@ def forward( >>> loss = outputs.loss ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.layoutlm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.layoutlm( input_ids, bbox, attention_mask=attention_mask, @@ -953,10 +941,9 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) masked_lm_loss = None @@ -967,10 +954,6 @@ def forward( labels.view(-1), ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1000,6 +983,7 @@ def __init__(self, config): def get_input_embeddings(self): return self.layoutlm.embeddings.word_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1014,8 +998,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -1061,9 +1044,8 @@ def forward( >>> loss = outputs.loss >>> logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.layoutlm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.layoutlm( input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, @@ -1073,11 +1055,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1103,9 +1083,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, @@ -1137,6 +1114,7 @@ def __init__(self, config): def get_input_embeddings(self): return self.layoutlm.embeddings.word_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(LAYOUTLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1151,8 +1129,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. 
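The classification heads keep the existing `config.problem_type` dispatch: regression when `num_labels == 1`, cross-entropy for single-label targets, `BCEWithLogitsLoss` for multi-label ones, with the type inferred from `num_labels` and the label dtype when it is unset. The three loss variants on dummy tensors:

```python
import torch
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

num_labels = 3
logits = torch.randn(2, num_labels)

# single-label classification: integer class ids
ce = CrossEntropyLoss()(logits.view(-1, num_labels), torch.tensor([0, 2]))

# regression (num_labels == 1 in the real heads): float targets
mse = MSELoss()(torch.randn(2), torch.randn(2))

# multi-label classification: independent sigmoid per label, multi-hot targets
bce = BCEWithLogitsLoss()(logits, torch.tensor([[1.0, 0.0, 1.0], [0.0, 1.0, 0.0]]))

print(ce.item(), mse.item(), bce.item())
```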
Indices should be in `[0, ..., config.num_labels - 1]`. @@ -1196,9 +1173,8 @@ def forward( >>> loss = outputs.loss >>> logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.layoutlm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.layoutlm( input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, @@ -1208,11 +1184,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1221,10 +1195,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1255,6 +1225,7 @@ def __init__(self, config, has_visual_segment_embedding=True): def get_input_embeddings(self): return self.layoutlm.embeddings.word_embeddings + @can_return_tuple @replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -1269,8 +1240,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1325,9 +1295,7 @@ def forward( M. Hamann P. Harper, P. 
Martinez ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.layoutlm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.layoutlm( input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, @@ -1337,11 +1305,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1364,10 +1330,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index f47483d9d861..137704a33d44 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -43,7 +43,7 @@ find_pruneable_heads_and_indices, prune_linear_layer, ) -from ...utils import logging +from ...utils import can_return_tuple, logging from .configuration_markuplm import MarkupLMConfig @@ -620,6 +620,7 @@ def __init__(self, config): self.layer = nn.ModuleList([MarkupLMLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -631,8 +632,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -685,18 +685,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -833,6 +821,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -847,8 +836,7 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" Returns: @@ -873,7 +861,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None 
else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -915,21 +902,17 @@ def forward( token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, extended_attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -967,6 +950,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -983,8 +967,7 @@ def forward( end_positions: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. 
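On the inference side nothing changes except that callers read `start_logits` and `end_logits` by name, as the docstring example below does. The argmax step isolated on dummy logits, with no tokenizer or processor involved:

```python
import torch
from transformers.modeling_outputs import QuestionAnsweringModelOutput

out = QuestionAnsweringModelOutput(
    start_logits=torch.tensor([[0.1, 2.0, 0.3, 0.0]]),
    end_logits=torch.tensor([[0.0, 0.1, 3.0, 0.2]]),
)

answer_start = int(out.start_logits.argmax(-1))  # 1
answer_end = int(out.end_logits.argmax(-1))      # 2
print(answer_start, answer_end)                  # tokens 1..2 form the answer span
```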
@@ -1021,9 +1004,8 @@ def forward( >>> processor.decode(predict_answer_tokens).strip() 'Niels' ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.markuplm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.markuplm( input_ids, xpath_tags_seq=xpath_tags_seq, xpath_subs_seq=xpath_subs_seq, @@ -1034,10 +1016,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -1061,10 +1042,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -1091,6 +1068,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1106,8 +1084,7 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
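Downstream decoding for the token-classification head reads `.logits` and maps the argmax ids through `config.id2label`. The label map below is invented for illustration and is not a real MarkupLM label set:

```python
import torch
from transformers.modeling_outputs import TokenClassifierOutput

id2label = {0: "O", 1: "HEADER", 2: "PARAGRAPH"}  # hypothetical labels
out = TokenClassifierOutput(logits=torch.randn(1, 5, len(id2label)))

predicted_ids = out.logits.argmax(-1)[0].tolist()
print([id2label[i] for i in predicted_ids])
```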
@@ -1135,9 +1112,8 @@ def forward( >>> loss = outputs.loss >>> logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.markuplm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.markuplm( input_ids, xpath_tags_seq=xpath_tags_seq, xpath_subs_seq=xpath_subs_seq, @@ -1148,10 +1124,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.classifier(sequence_output) # (batch_size, seq_length, node_type_size) loss = None @@ -1162,10 +1137,6 @@ def forward( labels.view(-1), ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=prediction_scores, @@ -1198,6 +1169,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MARKUPLM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1213,8 +1185,7 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., @@ -1241,9 +1212,8 @@ def forward( >>> loss = outputs.loss >>> logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.markuplm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.markuplm( input_ids, xpath_tags_seq=xpath_tags_seq, xpath_subs_seq=xpath_subs_seq, @@ -1254,11 +1224,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1284,9 +1252,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index b1d269055e65..7a8fbd06d72b 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -24,7 +24,7 @@ import os import warnings from dataclasses import dataclass -from typing import Optional, Tuple, Union +from typing import Optional, Tuple import torch from torch import nn @@ -48,6 +48,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -562,6 +563,7 @@ def __init__(self, config): super().__init__() self.layer = nn.ModuleList([MobileBertLayer(config) 
for _ in range(config.num_hidden_layers)]) + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -569,8 +571,7 @@ def forward( head_mask: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None for i, layer_module in enumerate(self.layer): @@ -592,8 +593,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions ) @@ -842,6 +841,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -858,13 +858,11 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -897,20 +895,16 @@ def forward( embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -952,6 +946,7 @@ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Em return super().resize_token_embeddings(new_num_tokens=new_num_tokens) + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MobileBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -966,8 +961,7 @@ def forward( next_sentence_label: Optional[torch.LongTensor] = None, output_attentions: Optional[torch.FloatTensor] = None, output_hidden_states: Optional[torch.FloatTensor] = None, - return_dict: Optional[torch.FloatTensor] = None, - ) -> Union[Tuple, MobileBertForPreTrainingOutput]: + ) -> MobileBertForPreTrainingOutput: r""" labels 
(`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -998,9 +992,8 @@ def forward( >>> prediction_logits = outputs.prediction_logits >>> seq_relationship_logits = outputs.seq_relationship_logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1009,9 +1002,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output, pooled_output = outputs[:2] + sequence_output = outputs.last_hidden_state + pooled_output = outputs.pooler_output prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) total_loss = None @@ -1021,10 +1014,6 @@ def forward( next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - if not return_dict: - output = (prediction_scores, seq_relationship_score) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return MobileBertForPreTrainingOutput( loss=total_loss, prediction_logits=prediction_scores, @@ -1061,6 +1050,7 @@ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Em ) return super().resize_token_embeddings(new_num_tokens=new_num_tokens) + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1080,17 +1070,15 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1099,10 +1087,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) masked_lm_loss = None @@ -1110,10 +1097,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1146,6 +1129,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1159,9 +1143,8 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple, NextSentencePredictorOutput]: + ) -> NextSentencePredictorOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the next sequence prediction (classification) loss. 
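Every hunk in this diff follows the same recipe: the explicit `return_dict` plumbing is removed from `forward`, and the new `@can_return_tuple` decorator converts the returned `ModelOutput` back into a tuple whenever a caller asks for one. The decorator itself lives in `transformers.utils` and is not part of this diff; the sketch below is only a simplified illustration of the pattern, and the helper names, the toy output class, and the exact resolution order of `return_dict` versus `config.use_return_dict` are assumptions rather than the library's implementation.

```python
# Simplified sketch of a can_return_tuple-style decorator -- NOT the transformers
# implementation, only an illustration of the pattern used throughout this diff.
from dataclasses import dataclass, fields
from functools import wraps
from typing import Optional, Tuple

import torch


@dataclass
class TinyOutput:
    """Stand-in for a ModelOutput: attribute access plus a to_tuple() conversion."""

    last_hidden_state: torch.Tensor
    hidden_states: Optional[Tuple[torch.Tensor, ...]] = None

    def to_tuple(self) -> tuple:
        # Keep only the fields that are set, mirroring how ModelOutput.to_tuple() skips None values.
        return tuple(getattr(self, f.name) for f in fields(self) if getattr(self, f.name) is not None)


def can_return_tuple_sketch(forward):
    """Intercept `return_dict` so the wrapped forward never has to handle it."""

    @wraps(forward)
    def wrapper(self, *args, return_dict: Optional[bool] = None, **kwargs):
        if return_dict is None:
            return_dict = getattr(self.config, "use_return_dict", True)
        output = forward(self, *args, **kwargs)  # forward always builds a ModelOutput
        return output if return_dict else output.to_tuple()

    return wrapper


class TinyEncoder:
    def __init__(self):
        self.config = type("Cfg", (), {"use_return_dict": True})()

    @can_return_tuple_sketch
    def forward(self, hidden_states: torch.Tensor) -> TinyOutput:
        return TinyOutput(last_hidden_state=hidden_states)


enc = TinyEncoder()
x = torch.zeros(1, 4, 8)
print(type(enc.forward(x)).__name__)                     # TinyOutput
print(type(enc.forward(x, return_dict=False)).__name__)  # tuple
```

Because the wrapped `forward` never sees `return_dict`, it can be annotated with a single concrete `ModelOutput` subclass, which is why the `Union[Tuple, ...]` return annotations disappear throughout these hunks.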
Input should be a sequence pair @@ -1198,9 +1181,7 @@ def forward( ) labels = kwargs.pop("next_sentence_label") - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1209,10 +1190,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output seq_relationship_score = self.cls(pooled_output) next_sentence_loss = None @@ -1220,10 +1200,6 @@ def forward( loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), labels.view(-1)) - if not return_dict: - output = (seq_relationship_score,) + outputs[2:] - return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output - return NextSentencePredictorOutput( loss=next_sentence_loss, logits=seq_relationship_score, @@ -1239,7 +1215,7 @@ def forward( """, MOBILEBERT_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification with Bert->MobileBert all-casing +# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification with Bert->MobileBert all-casing, BaseModelOutputWithPoolingAndCrossAttentions->BaseModelOutputWithPooling class MobileBertForSequenceClassification(MobileBertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1256,6 +1232,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, @@ -1275,17 +1252,14 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1294,11 +1268,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1324,9 +1296,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, @@ -1343,7 +1312,7 @@ def forward( """, MOBILEBERT_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MobileBert all-casing +# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MobileBert all-casing, BaseModelOutputWithPoolingAndCrossAttentions->BaseModelOutputWithPooling class MobileBertForQuestionAnswering(MobileBertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1355,6 +1324,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_QA, @@ -1377,8 +1347,7 @@ def forward( end_positions: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1389,9 +1358,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1400,11 +1367,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1427,10 +1392,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -1447,7 +1408,7 @@ def forward( """, MOBILEBERT_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice with Bert->MobileBert all-casing +# Copied from transformers.models.bert.modeling_bert.BertForMultipleChoice with Bert->MobileBert all-casing, BaseModelOutputWithPoolingAndCrossAttentions->BaseModelOutputWithPooling class MobileBertForMultipleChoice(MobileBertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1462,6 +1423,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @@ -1481,15 +1443,13 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. 
(See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1502,7 +1462,7 @@ def forward( else None ) - outputs = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1511,11 +1471,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] - + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) @@ -1525,10 +1483,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1544,7 +1498,7 @@ def forward( """, MOBILEBERT_START_DOCSTRING, ) -# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MobileBert all-casing +# Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MobileBert all-casing, BaseModelOutputWithPoolingAndCrossAttentions->BaseModelOutputWithPooling class MobileBertForTokenClassification(MobileBertPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1560,6 +1514,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(MOBILEBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, @@ -1579,15 +1534,12 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.mobilebert( + outputs: BaseModelOutputWithPooling = self.mobilebert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1596,11 +1548,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1609,10 +1559,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index f2dfa19a6a50..1ee772d9c84d 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -16,7 +16,7 @@ """PyTorch RoBERTa model.""" import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -46,6 +46,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, get_torch_version, logging, replace_return_docstrings, @@ -584,6 +585,7 @@ def __init__(self, config): self.layer = nn.ModuleList([RobertaLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -595,8 +597,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -649,18 +650,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -834,6 +823,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -854,8 +844,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the 
output of the last layer of the encoder. Used in the cross-attention if @@ -880,7 +869,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -975,7 +963,7 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -985,14 +973,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1027,6 +1011,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -1044,9 +1029,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1090,11 +1074,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1107,10 +1090,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) lm_loss = None @@ -1124,10 +1106,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1171,6 +1149,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1193,8 +1172,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1203,9 +1181,7 @@ def forward( kwargs (`Dict[str, any]`, *optional*, defaults to `{}`): Used to hide legacy arguments that have been deprecated. 
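The second recurring change is that internal sub-calls such as `self.roberta(...)` or `self.mobilebert(...)` are now annotated with the `ModelOutput` type they return and are read through named attributes (`outputs.last_hidden_state`, `outputs.pooler_output`) instead of positional indices like `outputs[0]` and `outputs[1]`, whose meaning shifts when optional fields are absent. The snippet below is not part of the diff; it only demonstrates, for a stock `transformers` output class, that both access styles address the same objects, so the rewrite is behaviour-preserving.

```python
import torch
from transformers.modeling_outputs import BaseModelOutputWithPooling

out = BaseModelOutputWithPooling(
    last_hidden_state=torch.zeros(1, 4, 8),
    pooler_output=torch.zeros(1, 8),
)

# Positional access still works, but it only counts the fields that are not None...
assert out[0] is out.last_hidden_state
assert out[1] is out.pooler_output

# ...so attribute access is both clearer and robust to optional fields
# (hidden_states, attentions) being absent.
print(out.last_hidden_state.shape, out.pooler_output.shape)
```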
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1216,9 +1192,8 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) masked_lm_loss = None @@ -1228,10 +1203,6 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1290,6 +1261,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="cardiffnlp/twitter-roberta-base-emotion", @@ -1309,17 +1281,14 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1328,9 +1297,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.classifier(sequence_output) loss = None @@ -1358,10 +1326,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, @@ -1388,6 +1352,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1405,15 +1370,13 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1426,7 +1389,7 @@ def forward( else None ) - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, @@ -1435,9 +1398,8 @@ def forward( inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1450,10 +1412,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1484,6 +1442,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="Jean-Baptiste/roberta-large-ner-english", @@ -1503,15 +1462,12 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1520,11 +1476,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1535,10 +1489,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1587,6 +1537,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="deepset/roberta-base-squad2", @@ -1607,8 +1558,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1619,9 +1569,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1630,11 +1578,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1657,10 +1603,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py index 6b0c40b222c1..56d42ac1bdcd 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py @@ -16,7 +16,7 @@ """PyTorch RoBERTa-PreLayerNorm model.""" import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -41,6 +41,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -468,6 +469,7 @@ def __init__(self, config): self.layer = nn.ModuleList([RobertaPreLayerNormLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -479,8 +481,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -533,18 +534,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -715,6 +704,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -735,8 +725,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], 
BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -761,7 +750,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -824,7 +812,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -834,15 +822,11 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state sequence_output = self.LayerNorm(sequence_output) pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -881,6 +865,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -898,9 +883,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder.
Used in the cross-attention if @@ -944,11 +928,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.roberta_prelayernorm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta_prelayernorm( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -961,10 +944,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) lm_loss = None @@ -978,10 +960,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1028,6 +1006,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1051,8 +1030,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1061,9 +1039,7 @@ def forward( kwargs (`Dict[str, any]`, *optional*, defaults to `{}`): Used to hide legacy arguments that have been deprecated. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta_prelayernorm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta_prelayernorm( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1074,9 +1050,8 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) masked_lm_loss = None @@ -1086,10 +1061,6 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1149,6 +1120,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1167,17 +1139,14 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta_prelayernorm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta_prelayernorm( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1186,9 +1155,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.classifier(sequence_output) loss = None @@ -1216,10 +1184,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, @@ -1247,6 +1211,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @@ -1266,15 +1231,13 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1287,7 +1250,7 @@ def forward( else None ) - outputs = self.roberta_prelayernorm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta_prelayernorm( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, @@ -1296,9 +1259,8 @@ def forward( inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1311,10 +1273,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1345,6 +1303,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1363,15 +1322,12 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta_prelayernorm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta_prelayernorm( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1380,11 +1336,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1395,10 +1349,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1448,6 +1398,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROBERTA_PRELAYERNORM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1467,8 +1418,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1479,9 +1429,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta_prelayernorm( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta_prelayernorm( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1490,11 +1438,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1517,10 +1463,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index b5ca264fb73d..d2d6c5aaf29b 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -16,7 +16,7 @@ import math import os -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -41,6 +41,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, ) @@ -613,6 +614,7 @@ def __init__(self, config): self.layer = nn.ModuleList([RoCBertLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -624,8 +626,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -678,18 +679,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -932,6 +921,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -955,8 +945,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states 
(`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -980,7 +969,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1045,7 +1033,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -1055,14 +1043,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1100,6 +1084,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1124,9 +1109,8 @@ def forward( labels_token_type_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" attack_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): attack sample ids for computing the contrastive loss.
Indices should be in `[-100, 0, ..., @@ -1185,9 +1169,8 @@ def forward( torch.Size([1, 11, 21128]) ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1198,10 +1181,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output, pooled_output = outputs[:2] + sequence_output = outputs.last_hidden_state + pooled_output = outputs.pooler_output prediction_scores = self.cls(sequence_output) loss = None @@ -1216,25 +1199,23 @@ def forward( target_inputs = torch.clone(labels_input_ids) target_inputs[target_inputs == -100] = self.config.pad_token_id - labels_output = self.roc_bert( + labels_output: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( target_inputs, input_shape_ids=labels_input_shape_ids, input_pronunciation_ids=labels_input_pronunciation_ids, attention_mask=labels_attention_mask, token_type_ids=labels_token_type_ids, - return_dict=return_dict, ) - attack_output = self.roc_bert( + attack_output: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( attack_input_ids, input_shape_ids=attack_input_shape_ids, input_pronunciation_ids=attack_input_pronunciation_ids, attention_mask=attack_attention_mask, token_type_ids=attack_token_type_ids, - return_dict=return_dict, ) - labels_pooled_output = labels_output[1] - attack_pooled_output = attack_output[1] + labels_pooled_output = labels_output.pooler_output + attack_pooled_output = attack_output.pooler_output pooled_output_norm = torch.nn.functional.normalize(pooled_output, dim=-1) labels_pooled_output_norm = torch.nn.functional.normalize(labels_pooled_output, dim=-1) @@ -1252,10 +1233,6 @@ def forward( else: loss = masked_lm_loss - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MaskedLMOutput( loss=loss, logits=prediction_scores, @@ -1293,6 +1270,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) def forward( self, @@ -1309,8 +1287,7 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1338,9 +1315,8 @@ def forward( '.' 
``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1353,10 +1329,9 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) masked_lm_loss = None @@ -1364,10 +1339,6 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1431,6 +1402,7 @@ def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings self.cls.predictions.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -1450,9 +1422,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -1502,9 +1473,8 @@ def forward( >>> prediction_logits = outputs.logits ``` """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1519,10 +1489,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.cls(sequence_output) lm_loss = None @@ -1534,10 +1503,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1621,6 +1586,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, @@ -1642,17 +1608,15 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1663,10 +1627,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1693,9 +1656,6 @@ def forward( elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, @@ -1725,6 +1685,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( ROC_BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @@ -1746,15 +1707,13 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1773,7 +1732,7 @@ def forward( else None ) - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1784,10 +1743,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1798,10 +1756,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1831,6 +1785,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, @@ -1852,15 +1807,12 @@ def forward( labels: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1871,10 +1823,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1884,10 +1835,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1913,6 +1860,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(ROC_BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_QA, @@ -1937,8 +1885,7 @@ def forward( end_positions: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1949,9 +1896,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roc_bert( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roc_bert( input_ids, input_shape_ids=input_shape_ids, input_pronunciation_ids=input_pronunciation_ids, @@ -1962,10 +1907,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) @@ -1989,10 +1933,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 174e766598a0..2fcd33d14386 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -16,7 +16,7 @@ import math from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -27,7 +27,13 @@ from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, ModelOutput, QuestionAnsweringModelOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + can_return_tuple, + logging, +) from .configuration_splinter import SplinterConfig @@ -424,6 +430,7 @@ def __init__(self, config): self.layer = nn.ModuleList([SplinterLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -435,8 +442,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -489,18 +495,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -633,6 +627,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(SPLINTER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 
@add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -653,8 +648,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -678,7 +672,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -735,7 +728,7 @@ def forward( inputs_embeds=inputs_embeds, past_key_values_length=past_key_values_length, ) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -745,12 +738,8 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] - - if not return_dict: - return (sequence_output,) + encoder_outputs[1:] + sequence_output = encoder_outputs.last_hidden_state return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=sequence_output, @@ -835,6 +824,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(SPLINTER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -853,9 +843,8 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, question_positions: Optional[torch.LongTensor] = None, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -871,7 +860,6 @@ def forward( the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size, sequence_length)`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict question_positions_were_none = False if question_positions is None: @@ -886,7 +874,7 @@ def forward( question_positions = question_position_for_each_example.unsqueeze(-1) question_positions_were_none = True - outputs = self.splinter( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.splinter( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -895,10 +883,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state start_logits, end_logits = self.splinter_qass(sequence_output, question_positions) if question_positions_were_none: @@ -925,10 +912,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, @@ -989,6 +972,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( SPLINTER_INPUTS_DOCSTRING.format("batch_size, num_questions, sequence_length") ) @@ -1004,9 +988,8 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, question_positions: Optional[torch.LongTensor] = None, - ) -> Union[Tuple, SplinterForPreTrainingOutput]: + ) -> SplinterForPreTrainingOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size, num_questions)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1022,7 +1005,6 @@ def forward( the only one for which start_logits and end_logits are calculated and they will be of shape `(batch_size, sequence_length)`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if question_positions is None and start_positions is not None and end_positions is not None: raise TypeError("question_positions must be specified in order to calculate the loss") @@ -1033,7 +1015,7 @@ def forward( elif question_positions is None: question_positions = self._prepare_question_positions(input_ids) - outputs = self.splinter( + outputs: BaseModelOutputWithPastAndCrossAttentions = self.splinter( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1042,10 +1024,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state batch_size, sequence_length, dim = sequence_output.size() # [batch_size, num_questions, sequence_length] start_logits, end_logits = self.splinter_qass(sequence_output, question_positions) @@ -1080,10 +1061,6 @@ def forward( ) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output - return SplinterForPreTrainingOutput( loss=total_loss, start_logits=start_logits, diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index a6a365596107..07cc76e7acfa 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -16,7 +16,7 @@ import copy from dataclasses import dataclass -from typing import Any, Callable, Optional, Tuple, Union +from typing import Any, Callable, Optional, Tuple import torch import torch.utils.checkpoint @@ -30,6 +30,7 @@ ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, logging, replace_return_docstrings, torch_int, @@ -682,6 +683,7 @@ def __init__(self, config: XCLIPConfig): self.layers = nn.ModuleList([XCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -689,8 +691,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -724,7 +725,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -757,8 +757,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -773,6 +771,7 @@ def __init__(self, config: XCLIPTextConfig): self.encoder = XCLIPEncoder(config) self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple @add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING) 
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPTextConfig) def forward( @@ -782,8 +781,7 @@ def forward( position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -792,7 +790,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is None: raise ValueError("You have to specify either input_ids") @@ -812,25 +809,21 @@ def forward( # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = self.final_layer_norm(last_hidden_state) # text_embeds.shape = [batch_size, sequence_length, transformer.width] # take features from the eot embedding (eot_token is the highest number in each sequence) pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -854,6 +847,7 @@ def get_input_embeddings(self) -> nn.Module: def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value + @can_return_tuple @add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPTextConfig) def forward( @@ -863,8 +857,7 @@ def forward( position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -888,7 +881,6 @@ def forward( position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) @@ -907,6 +899,7 @@ def __init__(self, config: XCLIPConfig): self.layers = nn.ModuleList([XCLIPVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, inputs_embeds, @@ -914,8 +907,7 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutput: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): @@ -949,7 +941,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if 
output_hidden_states else None all_attentions = () if output_attentions else None @@ -982,8 +973,6 @@ def forward( if output_hidden_states: encoder_states = encoder_states + (hidden_states,) - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) @@ -1004,6 +993,7 @@ def __init__(self, config: XCLIPVisionConfig): self.encoder = XCLIPVisionEncoder(config) self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + @can_return_tuple @add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPVisionConfig) def forward( @@ -1012,8 +1002,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1022,25 +1011,20 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) hidden_states = self.pre_layernorm(hidden_states) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] + last_hidden_state = encoder_outputs.last_hidden_state pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -1062,6 +1046,7 @@ def __init__(self, config: XCLIPVisionConfig): def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding + @can_return_tuple @add_start_docstrings_to_model_forward(X_CLIP_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=XCLIPVisionConfig) def forward( @@ -1069,8 +1054,7 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: + ) -> BaseModelOutputWithPooling: r""" Returns: @@ -1151,7 +1135,6 @@ def forward( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) @@ -1166,33 +1149,27 @@ def __init__(self, config: XCLIPVisionConfig): self.position_embedding = nn.Parameter(torch.empty(1, config.num_frames, config.hidden_size)) self.encoder = XCLIPEncoder(config) + @can_return_tuple def forward( self, - hidden_states, + hidden_states: torch.Tensor, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> BaseModelOutputWithPooling: residual = hidden_states # add position embeddings hidden_states = hidden_states + self.position_embedding - 
encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutput = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] - + last_hidden_state = encoder_outputs.last_hidden_state last_hidden_state = last_hidden_state.type(hidden_states.dtype) + residual - pooled_output = last_hidden_state.mean(dim=1, keepdim=False) - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, @@ -1339,6 +1316,7 @@ def __init__(self, config: XCLIPConfig): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(X_CLIP_TEXT_INPUTS_DOCSTRING) def get_text_features( self, @@ -1347,7 +1325,6 @@ def get_text_features( position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1370,18 +1347,16 @@ def get_text_features( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - text_embeds = text_outputs[1] + text_embeds = text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) return text_embeds @@ -1392,7 +1367,6 @@ def get_video_features( pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: @@ -1474,33 +1448,31 @@ def get_video_features( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict batch_size, num_frames, num_channels, height, width = pixel_values.shape pixel_values = pixel_values.reshape(-1, num_channels, height, width) - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - video_embeds = vision_outputs[1] + video_embeds = vision_outputs.pooler_output video_embeds = self.visual_projection(video_embeds) cls_features = video_embeds.view(batch_size, num_frames, -1) - mit_outputs = self.mit( + mit_outputs: BaseModelOutputWithPooling = self.mit( cls_features, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - video_embeds = mit_outputs[1] + video_embeds = mit_outputs.pooler_output return video_embeds + @can_return_tuple @add_start_docstrings_to_model_forward(X_CLIP_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=XCLIPOutput, config_class=XCLIPConfig) def forward( @@ -1513,8 +1485,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: bool = 
False, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, XCLIPOutput]: + ) -> XCLIPOutput: r""" Returns: @@ -1604,48 +1575,44 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict batch_size, num_frames, num_channels, height, width = pixel_values.shape pixel_values = pixel_values.reshape(-1, num_channels, height, width) - vision_outputs = self.vision_model( + vision_outputs: BaseModelOutputWithPooling = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, interpolate_pos_encoding=interpolate_pos_encoding, - return_dict=return_dict, ) - video_embeds = vision_outputs[1] + video_embeds = vision_outputs.pooler_output video_embeds = self.visual_projection(video_embeds) cls_features = video_embeds.view(batch_size, num_frames, -1) - mit_outputs = self.mit( + mit_outputs: BaseModelOutputWithPooling = self.mit( cls_features, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - video_embeds = mit_outputs[1] + video_embeds = mit_outputs.pooler_output - img_features = vision_outputs[0][:, 1:, :] + img_features = vision_outputs.last_hidden_state[:, 1:, :] img_features = self.prompts_visual_layernorm(img_features) img_features = img_features @ self.prompts_visual_projection img_features = img_features.view(batch_size, num_frames, -1, video_embeds.shape[-1]) img_features = img_features.mean(dim=1, keepdim=False) - text_outputs = self.text_model( + text_outputs: BaseModelOutputWithPooling = self.text_model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - text_embeds = text_outputs[1] + text_embeds = text_outputs.pooler_output text_embeds = self.text_projection(text_embeds) text_embeds = text_embeds.unsqueeze(0).expand(batch_size, -1, -1) @@ -1664,10 +1631,6 @@ def forward( if return_loss: loss = x_clip_loss(logits_per_text) - if not return_dict: - output = (logits_per_video, logits_per_text, text_embeds, video_embeds, text_outputs, vision_outputs) - return ((loss,) + output) if loss is not None else output - return XCLIPOutput( loss=loss, logits_per_video=logits_per_video, diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index 1fe5823c2066..2c5523fe61d4 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -16,7 +16,7 @@ """PyTorch XLM-RoBERTa model.""" import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -46,6 +46,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, get_torch_version, logging, replace_return_docstrings, @@ -585,6 +586,7 @@ def __init__(self, config): self.layer = nn.ModuleList([XLMRobertaLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states: torch.Tensor, @@ -596,8 +598,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = False, output_hidden_states: Optional[bool] = False, - return_dict: 
Optional[bool] = True, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + ) -> BaseModelOutputWithPastAndCrossAttentions: all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None @@ -650,18 +651,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -835,6 +824,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -855,8 +845,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -881,7 +870,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -976,7 +964,7 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -986,14 +974,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1030,6 +1014,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -1047,9 +1032,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> 
Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if @@ -1093,11 +1077,10 @@ def forward( >>> prediction_logits = outputs.logits ```""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1110,10 +1093,9 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) lm_loss = None @@ -1127,10 +1109,6 @@ def forward( **kwargs, ) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((lm_loss,) + output) if lm_loss is not None else output - return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, @@ -1178,6 +1156,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -1200,8 +1179,7 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + ) -> MaskedLMOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., @@ -1210,9 +1188,7 @@ def forward( kwargs (`Dict[str, any]`, *optional*, defaults to `{}`): Used to hide legacy arguments that have been deprecated. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1223,9 +1199,8 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state prediction_scores = self.lm_head(sequence_output) masked_lm_loss = None @@ -1235,10 +1210,6 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if not return_dict: - output = (prediction_scores,) + outputs[2:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, @@ -1299,6 +1270,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="cardiffnlp/twitter-roberta-base-emotion", @@ -1318,17 +1290,14 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + ) -> SequenceClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1337,9 +1306,8 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] + sequence_output = outputs.last_hidden_state logits = self.classifier(sequence_output) loss = None @@ -1367,10 +1335,6 @@ def forward( loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits, labels) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return SequenceClassifierOutput( loss=loss, logits=logits, @@ -1398,6 +1362,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward( XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") ) @@ -1417,15 +1382,13 @@ def forward( inputs_embeds: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + ) -> MultipleChoiceModelOutput: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1438,7 +1401,7 @@ def forward( else None ) - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, @@ -1447,9 +1410,8 @@ def forward( inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - pooled_output = outputs[1] + pooled_output = outputs.pooler_output pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1462,10 +1424,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if not return_dict: - output = (reshaped_logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return MultipleChoiceModelOutput( loss=loss, logits=reshaped_logits, @@ -1497,6 +1455,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="Jean-Baptiste/roberta-large-ner-english", @@ -1516,15 +1475,12 @@ def forward( labels: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + ) -> TokenClassifierOutput: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1533,11 +1489,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1548,10 +1502,6 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - return TokenClassifierOutput( loss=loss, logits=logits, @@ -1602,6 +1552,7 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint="deepset/roberta-base-squad2", @@ -1622,8 +1573,7 @@ def forward( end_positions: Optional[torch.LongTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + ) -> QuestionAnsweringModelOutput: r""" start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1634,9 +1584,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.roberta( + outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1645,11 +1593,9 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = outputs[0] - + sequence_output = outputs.last_hidden_state logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1).contiguous() @@ -1672,10 +1618,6 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index ad43c7903f4f..85577a767693 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -15,7 +15,7 @@ """PyTorch XLM RoBERTa xl,xxl model.""" import math -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import torch import torch.utils.checkpoint @@ -45,6 +45,7 @@ add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, + can_return_tuple, get_torch_version, logging, replace_return_docstrings, @@ -578,6 +579,7 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.gradient_checkpointing = False + @can_return_tuple def forward( self, hidden_states, @@ -589,8 +591,7 @@ def forward( use_cache=None, output_attentions=False, output_hidden_states=False, - return_dict=True, - ): + ) -> BaseModelOutputWithPastAndCrossAttentions: if self.gradient_checkpointing and self.training: if use_cache: logger.warning_once( @@ -644,18 +645,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, past_key_values=next_decoder_cache, @@ -816,6 +805,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( checkpoint=_CHECKPOINT_FOR_DOC, @@ -836,8 +826,7 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + ) -> BaseModelOutputWithPoolingAndCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if @@ -862,7 +851,6 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -957,7 +945,7 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - encoder_outputs = self.encoder( + encoder_outputs: BaseModelOutputWithPastAndCrossAttentions = self.encoder( embedding_output, attention_mask=extended_attention_mask, head_mask=head_mask, @@ -967,14 +955,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] - return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, @@ -1010,6 +994,7 @@ def set_output_embeddings(self, new_embeddings): self.lm_head.decoder = new_embeddings self.lm_head.bias = new_embeddings.bias + @can_return_tuple @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def forward( @@ -1027,9 +1012,8 @@ def forward( use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, **kwargs, - ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + ) -> CausalLMOutputWithCrossAttentions: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if
@@ -1070,11 +1054,10 @@ def forward(
         >>> prediction_logits = outputs.logits
         ```
         """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if labels is not None:
             use_cache = False
 
-        outputs = self.roberta(
+        outputs: BaseModelOutputWithPastAndCrossAttentions = self.roberta(
             input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
@@ -1087,10 +1070,9 @@ def forward(
             use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
         )
 
-        sequence_output = outputs[0]
+        sequence_output = outputs.last_hidden_state
         prediction_scores = self.lm_head(sequence_output)
 
         lm_loss = None
@@ -1102,10 +1084,6 @@ def forward(
                 **kwargs,
             )
 
-        if not return_dict:
-            output = (prediction_scores,) + outputs[2:]
-            return ((lm_loss,) + output) if lm_loss is not None else output
-
         return CausalLMOutputWithCrossAttentions(
             loss=lm_loss,
             logits=prediction_scores,
@@ -1188,6 +1166,7 @@ def set_output_embeddings(self, new_embeddings):
         self.lm_head.decoder = new_embeddings
         self.lm_head.bias = new_embeddings.bias
 
+    @can_return_tuple
     @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
@@ -1208,8 +1187,7 @@ def forward(
         labels: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, MaskedLMOutput]:
+    ) -> MaskedLMOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
@@ -1218,9 +1196,7 @@ def forward(
         kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
             Used to hide legacy arguments that have been deprecated.
         """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        outputs = self.roberta(
+        outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta(
             input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
@@ -1231,9 +1207,8 @@ def forward(
             encoder_attention_mask=encoder_attention_mask,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
         )
-        sequence_output = outputs[0]
+        sequence_output = outputs.last_hidden_state
         prediction_scores = self.lm_head(sequence_output)
 
         masked_lm_loss = None
@@ -1241,10 +1216,6 @@ def forward(
             loss_fct = CrossEntropyLoss()
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
 
-        if not return_dict:
-            output = (prediction_scores,) + outputs[2:]
-            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
-
         return MaskedLMOutput(
             loss=masked_lm_loss,
             logits=prediction_scores,
@@ -1302,6 +1273,7 @@ def __init__(self, config):
 
         self.init_weights()
 
+    @can_return_tuple
     @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
@@ -1319,17 +1291,15 @@ def forward(
         labels: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, SequenceClassifierOutput]:
+    ) -> SequenceClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
             config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
             `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
         """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        outputs = self.roberta(
+        outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta(
             input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
@@ -1338,9 +1308,8 @@ def forward(
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
         )
-        sequence_output = outputs[0]
+        sequence_output = outputs.last_hidden_state
         logits = self.classifier(sequence_output)
 
         loss = None
@@ -1366,10 +1335,6 @@ def forward(
                 loss_fct = BCEWithLogitsLoss()
                 loss = loss_fct(logits, labels)
 
-        if not return_dict:
-            output = (logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
         return SequenceClassifierOutput(
             loss=loss,
             logits=logits,
@@ -1395,6 +1360,7 @@ def __init__(self, config):
 
         self.init_weights()
 
+    @can_return_tuple
     @add_start_docstrings_to_model_forward(
         XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
     )
@@ -1414,15 +1380,13 @@ def forward(
         inputs_embeds: Optional[torch.FloatTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, MultipleChoiceModelOutput]:
+    ) -> MultipleChoiceModelOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
             num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
             `input_ids` above)
         """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
 
         flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -1435,7 +1399,7 @@ def forward(
             else None
         )
 
-        outputs = self.roberta(
+        outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta(
             flat_input_ids,
             position_ids=flat_position_ids,
             token_type_ids=flat_token_type_ids,
@@ -1444,9 +1408,8 @@ def forward(
             inputs_embeds=flat_inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
         )
-        pooled_output = outputs[1]
+        pooled_output = outputs.pooler_output
 
         pooled_output = self.dropout(pooled_output)
         logits = self.classifier(pooled_output)
@@ -1457,10 +1420,6 @@ def forward(
             loss_fct = CrossEntropyLoss()
             loss = loss_fct(reshaped_logits, labels)
 
-        if not return_dict:
-            output = (reshaped_logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
         return MultipleChoiceModelOutput(
             loss=loss,
             logits=reshaped_logits,
@@ -1490,6 +1449,7 @@ def __init__(self, config):
 
         self.init_weights()
 
+    @can_return_tuple
     @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
@@ -1507,15 +1467,12 @@ def forward(
         labels: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, TokenClassifierOutput]:
+    ) -> TokenClassifierOutput:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
         """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        outputs = self.roberta(
+        outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta(
             input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
@@ -1524,11 +1481,9 @@ def forward(
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
         )
 
-        sequence_output = outputs[0]
-
+        sequence_output = outputs.last_hidden_state
         sequence_output = self.dropout(sequence_output)
         logits = self.classifier(sequence_output)
 
@@ -1546,10 +1501,6 @@ def forward(
             else:
                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
 
-        if not return_dict:
-            output = (logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-
         return TokenClassifierOutput(
             loss=loss,
             logits=logits,
@@ -1597,6 +1548,7 @@ def __init__(self, config):
 
         self.init_weights()
 
+    @can_return_tuple
     @add_start_docstrings_to_model_forward(XLM_ROBERTA_XL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         checkpoint=_CHECKPOINT_FOR_DOC,
@@ -1615,8 +1567,7 @@ def forward(
         end_positions: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
+    ) -> QuestionAnsweringModelOutput:
         r"""
         start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for position (index) of the start of the labelled span for computing the token classification loss.
@@ -1627,9 +1578,7 @@ def forward(
             Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
             are not taken into account for computing the loss.
         """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        outputs = self.roberta(
+        outputs: BaseModelOutputWithPoolingAndCrossAttentions = self.roberta(
             input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
@@ -1638,10 +1587,9 @@ def forward(
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
         )
 
-        sequence_output = outputs[0]
+        sequence_output = outputs.last_hidden_state
 
         logits = self.qa_outputs(sequence_output)
         start_logits, end_logits = logits.split(1, dim=-1)
@@ -1665,10 +1613,6 @@ def forward(
             end_loss = loss_fct(end_logits, end_positions)
             total_loss = (start_loss + end_loss) / 2
 
-        if not return_dict:
-            output = (start_logits, end_logits) + outputs[2:]
-            return ((total_loss,) + output) if total_loss is not None else output
-
         return QuestionAnsweringModelOutput(
             loss=total_loss,
             start_logits=start_logits,
diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py
index a6aac8e3829a..92893fbddd0e 100644
--- a/tests/models/bert/test_modeling_bert.py
+++ b/tests/models/bert/test_modeling_bert.py
@@ -463,7 +463,6 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin
         if is_torch_available()
         else {}
     )
-    fx_compatible = True
     model_split_percents = [0.5, 0.8, 0.9]
 
     # special case for ForPreTraining model
diff --git a/tests/models/electra/test_modeling_electra.py b/tests/models/electra/test_modeling_electra.py
index 7d451ff6378a..9029501934f1 100644
--- a/tests/models/electra/test_modeling_electra.py
+++ b/tests/models/electra/test_modeling_electra.py
@@ -403,7 +403,6 @@ class ElectraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
         if is_torch_available()
         else {}
     )
-    fx_compatible = True
 
     # special case for ForPreTraining model
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
diff --git a/tests/models/mobilebert/test_modeling_mobilebert.py b/tests/models/mobilebert/test_modeling_mobilebert.py
index 126631fd9ce4..a2dd0a295041 100644
--- a/tests/models/mobilebert/test_modeling_mobilebert.py
+++ b/tests/models/mobilebert/test_modeling_mobilebert.py
@@ -282,7 +282,6 @@ class MobileBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCa
         if is_torch_available()
         else {}
     )
-    fx_compatible = True
 
     # special case for ForPreTraining model
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py
index 4f4d93b07f4d..f3adeafb17ba 100644
--- a/tests/models/roberta/test_modeling_roberta.py
+++ b/tests/models/roberta/test_modeling_roberta.py
@@ -392,7 +392,6 @@ class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
         if is_torch_available()
         else {}
     )
-    fx_compatible = True
     model_split_percents = [0.5, 0.8, 0.9]
 
     def setUp(self):
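Every modeling hunk above follows the same pattern: the per-`forward` `return_dict` plumbing is deleted, `forward` always builds a `ModelOutput` dataclass, and the new `@can_return_tuple` decorator handles tuple conversion at the call boundary. The snippet below is only a minimal sketch of that idea under stated assumptions; `can_return_tuple_sketch`, `SimpleOutput`, and `TinyModel` are illustrative names, not the actual `transformers.utils.can_return_tuple` implementation, which may differ in details.

```python
# Minimal sketch of the decorator pattern -- illustrative only.
import functools
from dataclasses import dataclass, fields
from typing import Any, Optional


@dataclass
class SimpleOutput:
    """Stand-in for a ModelOutput-style dataclass (hypothetical)."""

    last_hidden_state: Optional[Any] = None
    hidden_states: Optional[tuple] = None
    attentions: Optional[tuple] = None

    def to_tuple(self):
        # Keep only the fields that are set, mirroring ModelOutput.to_tuple().
        return tuple(getattr(self, f.name) for f in fields(self) if getattr(self, f.name) is not None)


def can_return_tuple_sketch(forward):
    """Pop `return_dict` before calling forward, convert the output afterwards."""

    @functools.wraps(forward)
    def wrapper(self, *args, **kwargs):
        return_dict = kwargs.pop("return_dict", None)
        if return_dict is None:
            # Fall back to the model config, defaulting to dataclass outputs.
            return_dict = getattr(getattr(self, "config", None), "use_return_dict", True)
        output = forward(self, *args, **kwargs)  # forward always returns a dataclass now
        return output if return_dict else output.to_tuple()

    return wrapper


class TinyModel:
    """Toy model showing that callers can still get tuple outputs."""

    class _Config:
        use_return_dict = True

    config = _Config()

    @can_return_tuple_sketch
    def forward(self, hidden_states):
        return SimpleOutput(last_hidden_state=hidden_states)


model = TinyModel()
print(type(model.forward([1, 2, 3])).__name__)     # SimpleOutput
print(model.forward([1, 2, 3], return_dict=False))  # ([1, 2, 3],)
```

With conversion centralized in the decorator, callers that pass `return_dict=False` keep receiving tuples, which is what lets each hunk above delete the `if not return_dict:` boilerplate without changing user-facing behavior.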