Small fix

This commit is contained in:
msr2000 2025-03-27 12:02:04 +08:00
parent 9d8dd91366
commit 72234287cb
3 changed files with 1 addition and 16 deletions

View File

@@ -9,7 +9,6 @@
"AutoModel": "modeling_deepseek.DeepseekV3Model", "AutoModel": "modeling_deepseek.DeepseekV3Model",
"AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM" "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
}, },
"aux_loss_alpha": 0.001,
"bos_token_id": 0, "bos_token_id": 0,
"eos_token_id": 1, "eos_token_id": 1,
"ep_size": 1, "ep_size": 1,
@@ -32,7 +31,6 @@
"num_hidden_layers": 61, "num_hidden_layers": 61,
"num_key_value_heads": 128, "num_key_value_heads": 128,
"num_nextn_predict_layers": 1, "num_nextn_predict_layers": 1,
"pretraining_tp": 1,
"q_lora_rank": 1536, "q_lora_rank": 1536,
"qk_nope_head_dim": 128, "qk_nope_head_dim": 128,
"qk_rope_head_dim": 64, "qk_rope_head_dim": 64,
@@ -58,7 +56,6 @@
"rope_theta": 10000, "rope_theta": 10000,
"routed_scaling_factor": 2.5, "routed_scaling_factor": 2.5,
"scoring_func": "sigmoid", "scoring_func": "sigmoid",
"seq_aux": true,
"tie_word_embeddings": false, "tie_word_embeddings": false,
"topk_group": 4, "topk_group": 4,
"topk_method": "noaux_tc", "topk_method": "noaux_tc",

View File

@@ -82,11 +82,6 @@ class DeepseekV3Config(PretrainedConfig):
Beginning of stream token id. Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2): eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id. End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`): tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0): rope_theta (`float`, *optional*, defaults to 10000.0):
@@ -141,8 +136,6 @@ class DeepseekV3Config(PretrainedConfig):
first_k_dense_replace = 3, first_k_dense_replace = 3,
norm_topk_prob = True, norm_topk_prob = True,
scoring_func = 'sigmoid', scoring_func = 'sigmoid',
aux_loss_alpha = 0.001,
seq_aux = True,
hidden_act="silu", hidden_act="silu",
max_position_embeddings=4096, max_position_embeddings=4096,
initializer_range=0.02, initializer_range=0.02,
@@ -151,7 +144,6 @@ class DeepseekV3Config(PretrainedConfig):
pad_token_id=None, pad_token_id=None,
bos_token_id=0, bos_token_id=0,
eos_token_id=1, eos_token_id=1,
pretraining_tp=1,
tie_word_embeddings=False, tie_word_embeddings=False,
rope_theta=10000.0, rope_theta=10000.0,
rope_scaling=None, rope_scaling=None,
@@ -184,8 +176,6 @@ class DeepseekV3Config(PretrainedConfig):
self.first_k_dense_replace = first_k_dense_replace self.first_k_dense_replace = first_k_dense_replace
self.norm_topk_prob = norm_topk_prob self.norm_topk_prob = norm_topk_prob
self.scoring_func = scoring_func self.scoring_func = scoring_func
self.aux_loss_alpha = aux_loss_alpha
self.seq_aux = seq_aux
# for backward compatibility # for backward compatibility
if num_key_value_heads is None: if num_key_value_heads is None:
num_key_value_heads = num_attention_heads num_key_value_heads = num_attention_heads
@@ -194,7 +184,6 @@ class DeepseekV3Config(PretrainedConfig):
self.hidden_act = hidden_act self.hidden_act = hidden_act
self.initializer_range = initializer_range self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache self.use_cache = use_cache
self.rope_theta = rope_theta self.rope_theta = rope_theta
self.rope_scaling = rope_scaling self.rope_scaling = rope_scaling

View File

@@ -398,7 +398,6 @@ class MoEGate(nn.Module):
self.n_routed_experts = config.n_routed_experts self.n_routed_experts = config.n_routed_experts
self.routed_scaling_factor = config.routed_scaling_factor self.routed_scaling_factor = config.routed_scaling_factor
self.scoring_func = config.scoring_func self.scoring_func = config.scoring_func
self.seq_aux = config.seq_aux
self.topk_method = config.topk_method self.topk_method = config.topk_method
self.n_group = config.n_group self.n_group = config.n_group
self.topk_group = config.topk_group self.topk_group = config.topk_group
@@ -455,7 +454,7 @@ class MoEGate(nn.Module):
) )
.reshape(bsz * seq_len, -1) .reshape(bsz * seq_len, -1)
) # [n, e] ) # [n, e]
tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), 0.0) # [n, e] tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), float("-inf")) # [n, e]
_, topk_idx = torch.topk( _, topk_idx = torch.topk(
tmp_scores, k=self.top_k, dim=-1, sorted=False tmp_scores, k=self.top_k, dim=-1, sorted=False
) )