# Imports assumed by this cell (model_name is defined earlier, e.g. a local or
# ModelScope/Hub path to a Qwen3 checkpoint; the traceback below goes through
# modelscope's hf_util patcher, so these classes may equally be imported via modelscope).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 1. Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_fast=False,
)

# 2. Pick the compute dtype: bf16 if the GPU supports it, otherwise fp16
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

# 3. Quantization config (QLoRA, fits GPUs with 12 GB+ VRAM)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# 4. Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    attn_implementation="flash_attention_2",  # optional: faster attention, requires the flash_attn package
)
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
Cell In[2], line 23
15 bnb_config = BitsAndBytesConfig(
16 load_in_4bit=True,
17 bnb_4bit_quant_type="nf4",
18 bnb_4bit_compute_dtype=compute_dtype,
19 bnb_4bit_use_double_quant=True,
20 )
22 # 4. 加载模型
---> 23 model = AutoModelForCausalLM.from_pretrained(
24 model_name,
25 quantization_config=bnb_config,
26 device_map="auto",
27 trust_remote_code=True,
28 low_cpu_mem_usage=True,
29 attn_implementation="flash_attention_2", # 可选:加速注意力计算
30 )
File e:\Python311\python11\Lib\site-packages\modelscope\utils\hf_util\patcher.py:285, in _patch_pretrained_class.<locals>.get_wrapped_class.<locals>.ClassWrapper.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
281 with file_pattern_context(kwargs, module_class, cls):
282 model_dir = get_model_dir(pretrained_model_name_or_path,
283 **kwargs)
--> 285 module_obj = module_class.from_pretrained(
286 model_dir, *model_args, **kwargs)
288 if module_class.__name__.startswith('AutoModel'):
289 module_obj.model_dir = model_dir
File e:\Python311\python11\Lib\site-packages\transformers\models\auto\auto_factory.py:604, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
602 if model_class.config_class == config.sub_configs.get("text_config", None):
603 config = config.get_text_config()
--> 604 return model_class.from_pretrained(
605 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
606 )
607 raise ValueError(
608 f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
609 f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping)}."
610 )
File e:\Python311\python11\Lib\site-packages\transformers\modeling_utils.py:277, in restore_default_dtype.<locals>._wrapper(*args, **kwargs)
275 old_dtype = torch.get_default_dtype()
276 try:
--> 277 return func(*args, **kwargs)
278 finally:
279 torch.set_default_dtype(old_dtype)
File e:\Python311\python11\Lib\site-packages\transformers\modeling_utils.py:4971, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, weights_only, *model_args, **kwargs)
4968 config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained.
4969 with ContextManagers(model_init_context):
4970 # Let's make sure we don't run the init function of buffer modules
-> 4971 model = cls(config, *model_args, **model_kwargs)
4973 # Make sure to tie the weights correctly
4974 model.tie_weights()
File e:\Python311\python11\Lib\site-packages\transformers\models\qwen3\modeling_qwen3.py:435, in Qwen3ForCausalLM.__init__(self, config)
434 def __init__(self, config):
--> 435 super().__init__(config)
436 self.model = Qwen3Model(config)
437 self.vocab_size = config.vocab_size
File e:\Python311\python11\Lib\site-packages\transformers\modeling_utils.py:2076, in PreTrainedModel.__init__(self, config, *inputs, **kwargs)
2072 self.config = config
2074 # Check the attention implementation is supported, or set it if not yet set (on the internal attr, to avoid
2075 # setting it recursively)
-> 2076 self.config._attn_implementation_internal = self._check_and_adjust_attn_implementation(
2077 self.config._attn_implementation, is_init_check=True
2078 )
2080 # for initialization of the loss
2081 loss_type = self.__class__.__name__
File e:\Python311\python11\Lib\site-packages\transformers\modeling_utils.py:2686, in PreTrainedModel._check_and_adjust_attn_implementation(self, attn_implementation, is_init_check)
2684 raise e
2685 else:
-> 2686 applicable_attn_implementation = self.get_correct_attn_implementation(
2687 applicable_attn_implementation, is_init_check
2688 )
2689 # preload flash attention here to allow compile with fullgraph
2690 if applicable_attn_implementation.startswith("flash_attention"):
File e:\Python311\python11\Lib\site-packages\transformers\modeling_utils.py:2714, in PreTrainedModel.get_correct_attn_implementation(self, requested_attention, is_init_check)
2712 # Perform relevant checks
2713 if applicable_attention == "flash_attention_2":
-> 2714 self._flash_attn_2_can_dispatch(is_init_check)
2715 elif applicable_attention == "flash_attention_3":
2716 self._flash_attn_3_can_dispatch(is_init_check)
File e:\Python311\python11\Lib\site-packages\transformers\modeling_utils.py:2422, in PreTrainedModel._flash_attn_2_can_dispatch(self, is_init_check)
2419 return True
2421 if importlib.util.find_spec("flash_attn") is None:
-> 2422 raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
2423 else:
2424 # Check FA2 installed version compatibility
2425 flash_attention_version = version.parse(importlib.metadata.version("flash_attn"))
ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.
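
The traceback is clear: attn_implementation="flash_attention_2" was requested, but the flash_attn package is not installed in this environment, so transformers refuses to build the model. Two ways out: install FlashAttention-2 (pip install flash-attn --no-build-isolation, which needs a CUDA build toolchain and, on Windows, usually a prebuilt wheel matching your Python/torch/CUDA versions), or fall back to PyTorch's built-in SDPA attention, which needs no extra package. A minimal sketch of the fallback, assuming model_name and bnb_config from the cell above:

import importlib.util

# Request FlashAttention-2 only if flash_attn is actually importable; otherwise use SDPA.
attn_impl = "flash_attention_2" if importlib.util.find_spec("flash_attn") else "sdpa"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    attn_implementation=attn_impl,
)

SDPA computes the same attention without the fused FlashAttention kernels; for a QLoRA run on a 12 GB card it is usually the path of least resistance, especially on Windows where building flash_attn from source is painful.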