使用 DGL 库训练时报出如下错误(关键信息是最后的 DGLError:CUSPARSE ERROR: 1,出现在 graph.update_all 调用 GPU 上的 SpMM 算子时):
Traceback (most recent call last):
File "/home/zhaozhimiao/ldd/GraphMAE/main_transductive.py", line 169, in <module>
main(args)
File "/home/zhaozhimiao/ldd/GraphMAE/main_transductive.py", line 138, in main
model = pretrain(model, graph, x, optimizer, max_epoch, device, scheduler, lr_f, weight_decay_f, max_epoch_f, linear_prob, logger)
File "/home/zhaozhimiao/ldd/GraphMAE/main_transductive.py", line 58, in pretrain
loss, loss_dict = model(graph, x)
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/fs1/private/user/zhaozhimiao/ldd/GraphMAE/graphmae/models/edcoder.py", line 229, in forward
loss = self.mask_attr_prediction(g, x)
File "/fs1/private/user/zhaozhimiao/ldd/GraphMAE/graphmae/models/edcoder.py", line 241, in mask_attr_prediction
enc_rep, all_hidden = self.encoder(use_g, use_x, return_hidden=True)
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/fs1/private/user/zhaozhimiao/ldd/GraphMAE/graphmae/models/gcn.py", line 68, in forward
h = self.gcn_layers[l](g, h)
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/fs1/private/user/zhaozhimiao/ldd/GraphMAE/graphmae/models/gcn.py", line 153, in forward
graph.update_all(aggregate_fn, fn.sum(msg='m', out='h'))
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/heterograph.py", line 5112, in update_all
ndata = core.message_passing(
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/core.py", line 398, in message_passing
ndata = invoke_gspmm(g, mfunc, rfunc)
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/core.py", line 359, in invoke_gspmm
z = op(graph, x, y)
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/ops/spmm.py", line 173, in func
return gspmm(g, binary_op, reduce_op, x, y)
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/ops/spmm.py", line 79, in gspmm
ret = gspmm_internal(
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/backend/pytorch/sparse.py", line 1032, in gspmm
return GSpMM.apply(*args)
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/torch/autograd/function.py", line 574, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/backend/pytorch/sparse.py", line 165, in forward
out, (argX, argY) = _gspmm(gidx, op, reduce_op, X, Y)
File "/home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/_sparse_ops.py", line 239, in _gspmm
_CAPI_DGLKernelSpMM(
File "dgl/_ffi/_cython/./function.pxi", line 295, in dgl._ffi._cy3.core.FunctionBase.__call__
File "dgl/_ffi/_cython/./function.pxi", line 241, in dgl._ffi._cy3.core.FuncCall
dgl._ffi.base.DGLError: [09:33:12] /opt/dgl/src/array/cuda/./spmm.cuh:220: Check failed: e == CUSPARSE_STATUS_SUCCESS: CUSPARSE ERROR: 1
Stack trace:
[bt] (0) /home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/libdgl.so(+0x1524704) [0x7fd38b6a7704]
[bt] (1) /home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/libdgl.so(void dgl::aten::(anonymous namespace)::CusparseCsrmm2<float, long>(DGLContext const&, dgl::aten::CSRMatrix const&, float const*, float const*, float*, int, bool)+0x175) [0x7fd38b7a6fc5]
[bt] (2) /home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/libdgl.so(void dgl::aten::SpMMCsr<2, long, float>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, dgl::BcastOff const&, dgl::aten::CSRMatrix const&, dgl::runtime::NDArray, dgl::runtime::NDArray, dgl::runtime::NDArray, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> >)+0xd05) [0x7fd38b7a8315]
[bt] (3) /home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/libdgl.so(dgl::aten::SpMM(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::shared_ptr<dgl::BaseHeteroGraph>, dgl::runtime::NDArray, dgl::runtime::NDArray, dgl::runtime::NDArray, std::vector<dgl::runtime::NDArray, std::allocator<dgl::runtime::NDArray> >)+0x10c4) [0x7fd38a9edd14]
[bt] (4) /home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/libdgl.so(+0x88a45b) [0x7fd38aa0d45b]
[bt] (5) /home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/libdgl.so(+0x88a701) [0x7fd38aa0d701]
[bt] (6) /home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/libdgl.so(DGLFuncCall+0x4f) [0x7fd38aa5d4ef]
[bt] (7) /home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/_ffi/_cy3/core.cpython-39-x86_64-linux-gnu.so(+0x1dd61) [0x7fd389d6fd61]
[bt] (8) /home/zhaozhimiao/anaconda3/envs/GraphMAE_Ldd/lib/python3.9/site-packages/dgl/_ffi/_cy3/core.cpython-39-x86_64-linux-gnu.so(+0x1e150) [0x7fd389d70150]
解决方法与上一篇文章相同:
在指定设备的那条语句之前,先调用 torch.cuda.set_device() 并传入实际要使用的 GPU 编号(须与后面指定的设备一致,例如使用 cuda:1 时写 torch.cuda.set_device(1)),即可解决。
也就是在上图的第三行(指定设备的语句)之前加上第二行(torch.cuda.set_device 的调用);原来的程序里没有第二行。