diff --git a/config.json b/config.json index 5157c81..c482e77 100644 --- a/config.json +++ b/config.json @@ -9,7 +9,6 @@ "AutoModel": "modeling_deepseek.DeepseekV3Model", "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM" }, - "aux_loss_alpha": 0.001, "bos_token_id": 0, "eos_token_id": 1, "ep_size": 1, @@ -32,7 +31,6 @@ "num_hidden_layers": 61, "num_key_value_heads": 128, "num_nextn_predict_layers": 1, - "pretraining_tp": 1, "q_lora_rank": 1536, "qk_nope_head_dim": 128, "qk_rope_head_dim": 64, @@ -58,7 +56,6 @@ "rope_theta": 10000, "routed_scaling_factor": 2.5, "scoring_func": "sigmoid", - "seq_aux": true, "tie_word_embeddings": false, "topk_group": 4, "topk_method": "noaux_tc", diff --git a/configuration_deepseek.py b/configuration_deepseek.py index f2a4247..f549f2b 100644 --- a/configuration_deepseek.py +++ b/configuration_deepseek.py @@ -82,11 +82,6 @@ class DeepseekV3Config(PretrainedConfig): Beginning of stream token id. eos_token_id (`int`, *optional*, defaults to 2): End of stream token id. - pretraining_tp (`int`, *optional*, defaults to 1): - Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this - document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is - necessary to ensure exact reproducibility of the pretraining results. Please refer to [this - issue](https://github.com/pytorch/pytorch/issues/76232). tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings rope_theta (`float`, *optional*, defaults to 10000.0): @@ -141,8 +136,6 @@ class DeepseekV3Config(PretrainedConfig): first_k_dense_replace = 3, norm_topk_prob = True, scoring_func = 'sigmoid', - aux_loss_alpha = 0.001, - seq_aux = True, hidden_act="silu", max_position_embeddings=4096, initializer_range=0.02, @@ -151,7 +144,6 @@ class DeepseekV3Config(PretrainedConfig): pad_token_id=None, bos_token_id=0, eos_token_id=1, - pretraining_tp=1, tie_word_embeddings=False, rope_theta=10000.0, rope_scaling=None, @@ -184,8 +176,6 @@ class DeepseekV3Config(PretrainedConfig): self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob self.scoring_func = scoring_func - self.aux_loss_alpha = aux_loss_alpha - self.seq_aux = seq_aux # for backward compatibility if num_key_value_heads is None: num_key_value_heads = num_attention_heads @@ -194,7 +184,6 @@ class DeepseekV3Config(PretrainedConfig): self.hidden_act = hidden_act self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps - self.pretraining_tp = pretraining_tp self.use_cache = use_cache self.rope_theta = rope_theta self.rope_scaling = rope_scaling diff --git a/modeling_deepseek.py b/modeling_deepseek.py index 1192a00..28d9ea2 100644 --- a/modeling_deepseek.py +++ b/modeling_deepseek.py @@ -398,7 +398,6 @@ class MoEGate(nn.Module): self.n_routed_experts = config.n_routed_experts self.routed_scaling_factor = config.routed_scaling_factor self.scoring_func = config.scoring_func - self.seq_aux = config.seq_aux self.topk_method = config.topk_method self.n_group = config.n_group self.topk_group = config.topk_group @@ -455,7 +454,7 @@ class MoEGate(nn.Module): ) .reshape(bsz * seq_len, -1) ) # [n, e] - tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) # [n, e] + tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf")) # [n, e] _, topk_idx = torch.topk( tmp_scores, k=self.top_k, dim=-1, sorted=False )