@@ -23,6 +23,7 @@
 # pylint: disable=unused-argument
 # pylint: disable=attribute-defined-outside-init
 # pylint: disable=self-cls-assignment
+# pylint: disable=no-name-in-module
 """
 Abstract class for Pretrained models.
 """
@@ -38,6 +39,7 @@
 import mindspore
 from mindspore import load_checkpoint, save_checkpoint
 from mindspore import nn, ops, Tensor, Parameter
+from mindspore._c_expression import MixedPrecisionType
 
 from mindnlp.configs import MS_URL_BASE, HF_URL_BASE, PT_WEIGHTS_NAME, WEIGHTS_NAME, WEIGHTS_INDEX_NAME, PT_WEIGHTS_INDEX_NAME
 from mindnlp.utils.download import is_remote_url, download_url, cached_file, get_checkpoint_shard_files
@@ -59,11 +61,20 @@ class CellUtilMixin:
     """
 
     @property
-    def dtype(self) -> mindspore.dtype:
+    def dtype(self) -> mindspore.TensorType:
         """
         `mindspore.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
         """
-        return mindspore.float32
+        if not hasattr(self, 'get_mixed_precision_type'):
+            return mindspore.float32
+        mixed_type = self.get_mixed_precision_type()
+        if mixed_type == MixedPrecisionType.FP16:
+            cast_type = mindspore.float16
+        elif mixed_type == MixedPrecisionType.BF16:
+            cast_type = mindspore.bfloat16
+        else:
+            cast_type = mindspore.float32
+        return cast_type
 
     @staticmethod
     def create_extended_attention_mask_for_decoder(input_shape, attention_mask):
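The reworked `dtype` property above reports the cast type recorded for the cell (FP16/BF16 via `MixedPrecisionType`) and falls back to `mindspore.float32` when no mixed-precision information is available. A minimal usage sketch, not part of the patch; `TinyModel` is a made-up cell and the printed value depends on whether a mixed-precision type has been recorded:

import mindspore
from mindspore import nn

class TinyModel(CellUtilMixin, nn.Cell):  # CellUtilMixin as defined above in this module
    def __init__(self):
        super().__init__()
        self.dense = nn.Dense(4, 4)

model = TinyModel()
# Returns mindspore.float16 / mindspore.bfloat16 when the recorded
# mixed-precision type is FP16 / BF16, otherwise mindspore.float32.
print(model.dtype)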
@@ -387,7 +398,7 @@ def tie_weights(self):
             self._tie_encoder_decoder_weights(
                 self.encoder, self.decoder, self.base_model_prefix)
 
-        for cell in self.cells():
+        for _, cell in self.cells_and_names():
             if hasattr(cell, "_tie_weights"):
                 cell._tie_weights()
 
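`cells_and_names()` walks the cell itself and every nested sub-cell and yields `(name, cell)` pairs, whereas `cells()` only yields immediate children, so the new loop reaches `_tie_weights` hooks buried deeper in the network. A small illustrative sketch with toy cells, not from the patch:

from mindspore import nn

class Inner(nn.Cell):
    def __init__(self):
        super().__init__()
        self.proj = nn.Dense(8, 8)

class Outer(nn.Cell):
    def __init__(self):
        super().__init__()
        self.inner = Inner()

net = Outer()
# Recursively visits net itself, net.inner and net.inner.proj,
# so a hook defined on any sub-cell would be found.
for name, cell in net.cells_and_names():
    print(name or '<root>', type(cell).__name__)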
@@ -398,20 +409,27 @@ def _tie_encoder_decoder_weights(encoder: nn.Cell, decoder: nn.Cell, base_model_
     def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
         """ Tie or clone module weights depending of weither we are using or not
         """
-        output_embeddings.weight = input_embeddings.embedding_table
-        output_embeddings._params['weight'] = input_embeddings.embedding_table
+        if hasattr(output_embeddings, 'weight'):
+            output_embeddings.weight = input_embeddings.embedding_table
+            output_embeddings._params['weight'] = input_embeddings.embedding_table
+
+        if hasattr(output_embeddings, 'embedding_table'):
+            output_embeddings.embedding_table = input_embeddings.embedding_table
+            output_embeddings._params['embedding_table'] = input_embeddings.embedding_table
+
         if getattr(output_embeddings, "bias", None) is not None:
             if output_embeddings.weight.shape[0] == output_embeddings.bias.shape[0]:
                 pass
             else:
                 # instantial a new Parameter since mindspore.Parameter do not support assign_value with different shape
-                output_embeddings.bias = Parameter(ops.pad(
+                replace_references(output_embeddings.bias, Parameter(ops.pad(
                     output_embeddings.bias.data,
                     (0, output_embeddings.weight.shape[0] -
                      output_embeddings.bias.shape[0]),
                     "constant",
                     0,
-                ))
+                ), name=output_embeddings.bias.name, requires_grad=output_embeddings.bias.requires_grad))
+
         if hasattr(output_embeddings, "out_channels") and hasattr(input_embeddings, "vocab_size"):
             output_embeddings.out_channels = input_embeddings.vocab_size
 
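`_tie_or_clone_weights` now tolerates output heads that expose either a `weight` (e.g. `nn.Dense`) or an `embedding_table` (e.g. `nn.Embedding`), pointing whichever attribute exists at the input embedding's table. A conceptual sketch of what tying achieves, with illustrative layer sizes rather than the patch's own code:

import mindspore
from mindspore import nn

embedding = nn.Embedding(vocab_size=100, embedding_size=16)
lm_head = nn.Dense(16, 100, has_bias=False)

# Tying makes the output projection reuse the input embedding matrix,
# so both attributes refer to the same Parameter object.
lm_head.weight = embedding.embedding_table
assert lm_head.weight is embedding.embedding_table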
@@ -435,7 +453,6 @@ def resize_token_embeddings(
         model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
         if new_num_tokens is None and pad_to_multiple_of is None:
             return model_embeds
-
         # Update base model and current model config
         self.config.vocab_size = model_embeds.embedding_table.shape[0]
         self.vocab_size = model_embeds.embedding_table.shape[0]
@@ -641,6 +658,8 @@ def from_pretrained(
         output_loading_info = kwargs.pop("output_loading_info", False)
         subfolder = kwargs.pop("subfolder", "")
         variant = kwargs.pop("variant", None)
+        ms_dtype = kwargs.pop("ms_dtype", None)
+        _ = kwargs.pop('low_cpu_mem_usage', None)
 
         is_sharded = False
         # Load config if we don't provide a configuration
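With the two new kwargs, callers can request a target dtype at load time (`ms_dtype`), while `low_cpu_mem_usage` is accepted purely for compatibility with Hugging Face-style call sites and otherwise ignored. A hedged usage sketch; `SomeModel` and the checkpoint id are placeholders, not real names:

import mindspore

model = SomeModel.from_pretrained(
    'org/model-name',
    ms_dtype=mindspore.float16,   # cast the network and the loaded weights to fp16
    low_cpu_mem_usage=True,       # popped and discarded for HF compatibility
)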
@@ -800,6 +819,8 @@ def from_pretrained(
 
         # Instantiate model.
         model = cls(config, *model_args, **model_kwargs)
+        if ms_dtype:
+            model = model.to_float(ms_dtype)
 
         if from_pt:
             if is_sharded:
@@ -827,43 +848,66 @@ def load_ckpt(resolved_archive_file):
         keys_missing = list(model.parameters_dict().keys())
         param_id_set = set()
 
+        use_keep_in_fp32_modules = False
+        if model._keep_in_fp32_modules:
+            use_keep_in_fp32_modules = True
 
         def load_param_into_net(model: nn.Cell, param_dict: dict, prefix: str):
+            keep_in_fp32_modules = model._keep_in_fp32_modules
             keys_unexpected = list(param_dict.keys())
 
             has_prefix_module = any(s.startswith(prefix) for s in keys_unexpected)
             expects_prefix_module = any(s.startswith(prefix) for s in keys_missing)
 
             for pname_in_net, param in model.parameters_and_names():
                 if has_prefix_module and not expects_prefix_module:
-                    param_name = prefix + '.' + param.name
+                    param_name = prefix + '.' + pname_in_net
                 elif not has_prefix_module and expects_prefix_module:
-                    param_name = param.name.replace(f'{prefix}.', '')
+                    param_name = pname_in_net.replace(f'{prefix}.', '')
                 else:
-                    param_name = param.name
+                    param_name = pname_in_net
 
                 if id(param) in param_id_set:
                     # for tied params
                     if pname_in_net in keys_missing:
                         keys_missing.remove(pname_in_net)
 
-                    if pname_in_net in keys_unexpected:
-                        keys_unexpected.remove(pname_in_net)
+                    if param_name in keys_missing:
+                        keys_missing.remove(param_name)
+
+                    if param_name in keys_unexpected:
+                        keys_unexpected.remove(param_name)
                     continue
                 new_param = param_dict.pop(param_name, None)
+
                 if new_param is not None:
+                    use_replace = False
                     if new_param.shape != param.shape:
                         if not ignore_mismatched_sizes:
                             raise RuntimeError(f'The shape of parameter `{param.name} is {param.shape}, but got mismatch parameter'
                                                f' `{param_name} with shape {new_param.shape} in checkpoint, '
                                                f'\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.')
                         logger.warning(f'The shape of parameter `{param.name} is {param.shape}, but got mismatch parameter'
                                        f' `{param_name} with shape {new_param.shape} in checkpoint, ')
-                        param = Parameter(new_param.data, param.name)
+                        continue
+
+                    if new_param.dtype != param.dtype:
+                        use_replace = True
+
+                    if ms_dtype:
+                        use_replace = True
+                        new_param = new_param.astype(ms_dtype)
+
+                    if use_keep_in_fp32_modules and \
+                        any(module_to_keep_in_fp32 in pname_in_net.split(".") for module_to_keep_in_fp32 in keep_in_fp32_modules):
+                        new_param = new_param.astype(mindspore.float32)
+
+                    if use_replace:
+                        replace_references(param, Parameter(new_param, name=param.name, requires_grad=param.requires_grad))
                     else:
                         param.set_data(new_param)
                     keys_unexpected.remove(param_name)
-                    keys_missing.remove(param.name)
+                    keys_missing.remove(pname_in_net)
                     param_id_set.add(id(param))
 
             return keys_unexpected, keys_missing
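Inside `load_param_into_net`, checkpoint keys are now resolved from the in-network name `pname_in_net` instead of `param.name`, adding or stripping the base-model prefix so that a bare backbone and a task-specific head can load each other's checkpoints. The mapping rule can be isolated as a plain-Python sketch (the parameter names below are illustrative):

def map_name(pname_in_net, prefix, has_prefix_module, expects_prefix_module):
    # Mirrors the key-mapping branch of load_param_into_net above.
    if has_prefix_module and not expects_prefix_module:
        return prefix + '.' + pname_in_net              # checkpoint has the prefix, net does not
    if not has_prefix_module and expects_prefix_module:
        return pname_in_net.replace(f'{prefix}.', '')   # net has the prefix, checkpoint does not
    return pname_in_net

assert map_name('encoder.layer.0.weight', 'bert', True, False) == 'bert.encoder.layer.0.weight'
assert map_name('bert.encoder.layer.0.weight', 'bert', False, True) == 'encoder.layer.0.weight'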
@@ -1340,6 +1384,7 @@ def convert_torch_to_mindspore(pth_file):
                 key = key.replace('.bias', '.beta')
             if 'wpe' in key or 'wte' in key or \
                 'embeddings' in key or 'embedding' in key or \
+                'shared' in key or 'relative_attention_bias' in key or \
                 'embed_' in key or '_embed' in key and \
                 'embedding_hidden_mapping_in' not in key: # for albert
                 key = key.replace('weight', 'embedding_table')
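The added condition extends the `weight` to `embedding_table` rename in `convert_torch_to_mindspore` to keys containing `shared` or `relative_attention_bias` (e.g. T5-style embedding tables). A standalone sketch on illustrative checkpoint keys:

# Illustrative PyTorch-style keys now covered by the extra branch.
for key in ('shared.weight',
            'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight'):
    if 'shared' in key or 'relative_attention_bias' in key:
        key = key.replace('weight', 'embedding_table')
    print(key)  # ends in '.embedding_table' after the rename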
@@ -1734,3 +1779,27 @@ def construct(self, hidden_states: Tensor, cls_index: Optional[Tensor] = None) -
         output = self.activation(output)
         output = self.last_dropout(output)
         return output
+
+def replace_references(old_obj, new_obj):
+    """use replace_references instead of Tensor.set_data due to mindspore errors."""
+    # Get all objects referring to old_obj
+    referrers = gc.get_referrers(old_obj)
+
+    # Replace references
+    for referrer in referrers:
+        if isinstance(referrer, dict):
+            # If the reference is in a dictionary
+            for key, value in referrer.items():
+                if value is old_obj:
+                    referrer[key] = new_obj
+        elif isinstance(referrer, list):
+            # If the reference is in a list or tuple
+            index = referrer.index(old_obj)
+            referrer[index] = new_obj
+        elif isinstance(referrer, tuple):
+            pass
+        elif hasattr(referrer, '__dict__'):
+            # If the reference is in the __dict__ of an object
+            for key, value in referrer.__dict__.items():
+                if value is old_obj:
+                    setattr(referrer, key, new_obj)
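`replace_references` swaps every discoverable reference to the old object for the new one via `gc.get_referrers`, covering dicts (including instance `__dict__`s), lists and attribute slots, while immutable tuples are skipped; it assumes `gc` is imported in this module. A plain-Python sketch of the behaviour using toy objects rather than Parameters:

class Holder:
    def __init__(self, value):
        self.value = value

old = ['old']                    # stand-in for the old Parameter
holder = Holder(old)
registry = {'param': old}

new = ['new']
replace_references(old, new)     # replace_references as defined above
# Every dict entry and attribute that pointed at `old` now points at `new`.
print(holder.value is new, registry['param'] is new)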