diff --git a/modeling_deepseek.py b/modeling_deepseek.py
index 1192a00..b00c9cf 100644
--- a/modeling_deepseek.py
+++ b/modeling_deepseek.py
@@ -43,7 +43,6 @@ from transformers.modeling_outputs import (
 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import (
     ALL_LAYERNORM_LAYERS,
-    is_torch_greater_or_equal_than_1_13,
 )
 from transformers.utils import (
     add_start_docstrings,
@@ -66,9 +65,6 @@ if is_flash_attn_2_available():
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
 if is_torch_fx_available():
-    if not is_torch_greater_or_equal_than_1_13:
-        import torch.fx
-
     _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)