Due to the CUDA version installed on the server, I was unable to use PyTorch 1.0.0 as requested by the author, so I upgraded PyTorch to 1.9.1. After the upgrade, training fails with the error below (an `undefined symbol: THCudaHalfTensor_normall` AttributeError raised while importing apex). How can I resolve this problem?
_
/opt/conda/envs/semiseg2/lib/python3.6/site-packages/torch/distributed/launch.py:186: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torch.distributed.run.
Note that --use_env is set by default in torch.distributed.run.
If your script expects --local_rank
argument to be set, please
change it to read from os.environ['LOCAL_RANK']
instead. See
https://pytorch.org/docs/stable/distributed.html#launch-utility for
further instructions
FutureWarning,
WARNING:torch.distributed.run:*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
Traceback (most recent call last):
File "train.py", line 28, in
Traceback (most recent call last):
File "train.py", line 28, in
from apex.parallel import DistributedDataParallel, SyncBatchNorm
from apex.parallel import DistributedDataParallel, SyncBatchNorm
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/apex-0.1-py3.6-linux-x86_64.egg/apex/init.py", line 12, in
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/apex-0.1-py3.6-linux-x86_64.egg/apex/init.py", line 12, in
from . import optimizers
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/apex-0.1-py3.6-linux-x86_64.egg/apex/optimizers/init.py", line 2, in
from . import optimizers
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/apex-0.1-py3.6-linux-x86_64.egg/apex/optimizers/init.py", line 2, in
from .fp16_optimizer import FP16_Optimizer
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/apex-0.1-py3.6-linux-x86_64.egg/apex/optimizers/fp16_optimizer.py", line 8, in
from .fp16_optimizer import FP16_Optimizer
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/apex-0.1-py3.6-linux-x86_64.egg/apex/optimizers/fp16_optimizer.py", line 8, in
lib.THCudaHalfTensor_normall.argtypes=[ctypes.c_void_p, ctypes.c_void_p]
File "/opt/conda/envs/semiseg2/lib/python3.6/ctypes/init.py", line 361, in getattr
lib.THCudaHalfTensor_normall.argtypes=[ctypes.c_void_p, ctypes.c_void_p]
File "/opt/conda/envs/semiseg2/lib/python3.6/ctypes/init.py", line 361, in getattr
func = self.getitem(name)
File "/opt/conda/envs/semiseg2/lib/python3.6/ctypes/init.py", line 366, in getitem
func = self.getitem(name)
File "/opt/conda/envs/semiseg2/lib/python3.6/ctypes/init.py", line 366, in getitem
func = self._FuncPtr((name_or_ordinal, self))
AttributeError: /opt/conda/envs/semiseg2/bin/python: undefined symbol: THCudaHalfTensor_normall
func = self._FuncPtr((name_or_ordinal, self))
AttributeError: /opt/conda/envs/semiseg2/bin/python: undefined symbol: THCudaHalfTensor_normall
Traceback (most recent call last):
File "train.py", line 28, in
from apex.parallel import DistributedDataParallel, SyncBatchNorm
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/apex-0.1-py3.6-linux-x86_64.egg/apex/init.py", line 12, in
from . import optimizers
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/apex-0.1-py3.6-linux-x86_64.egg/apex/optimizers/init.py", line 2, in
from .fp16_optimizer import FP16_Optimizer
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/apex-0.1-py3.6-linux-x86_64.egg/apex/optimizers/fp16_optimizer.py", line 8, in
lib.THCudaHalfTensor_normall.argtypes=[ctypes.c_void_p, ctypes.c_void_p]
File "/opt/conda/envs/semiseg2/lib/python3.6/ctypes/init.py", line 361, in getattr
func = self.getitem(name)
File "/opt/conda/envs/semiseg2/lib/python3.6/ctypes/init.py", line 366, in getitem
func = self._FuncPtr((name_or_ordinal, self))
AttributeError: /opt/conda/envs/semiseg2/bin/python: undefined symbol: THCudaHalfTensor_normall
Traceback (most recent call last):
File "train.py", line 28, in
from apex.parallel import DistributedDataParallel, SyncBatchNorm
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/apex-0.1-py3.6-linux-x86_64.egg/apex/init.py", line 12, in
from . import optimizers
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/apex-0.1-py3.6-linux-x86_64.egg/apex/optimizers/init.py", line 2, in
from .fp16_optimizer import FP16_Optimizer
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/apex-0.1-py3.6-linux-x86_64.egg/apex/optimizers/fp16_optimizer.py", line 8, in
lib.THCudaHalfTensor_normall.argtypes=[ctypes.c_void_p, ctypes.c_void_p]
File "/opt/conda/envs/semiseg2/lib/python3.6/ctypes/init.py", line 361, in getattr
func = self.getitem(name)
File "/opt/conda/envs/semiseg2/lib/python3.6/ctypes/init.py", line 366, in getitem
func = self._FuncPtr((name_or_ordinal, self))
AttributeError: /opt/conda/envs/semiseg2/bin/python: undefined symbol: THCudaHalfTensor_normall
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 34178) of binary: /opt/conda/envs/semiseg2/bin/python
Traceback (most recent call last):
File "/opt/conda/envs/semiseg2/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"main", mod_spec)
File "/opt/conda/envs/semiseg2/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/torch/distributed/launch.py", line 193, in
main()
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/torch/distributed/launch.py", line 189, in main
launch(args)
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/torch/distributed/launch.py", line 174, in launch
run(args)
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/torch/distributed/run.py", line 692, in run
)(*cmd_args)
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 116, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/envs/semiseg2/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
=======================================
Root Cause:
[0]:
time: 2021-09-22_11:17:30
rank: 0 (local_rank: 0)
exitcode: 1 (pid: 34178)
error_file: <N/A>
msg: "Process failed with exitcode 1"
Other Failures:
[1]:
time: 2021-09-22_11:17:30
rank: 1 (local_rank: 1)
exitcode: 1 (pid: 34179)
error_file: <N/A>
msg: "Process failed with exitcode 1"
[2]:
time: 2021-09-22_11:17:30
rank: 2 (local_rank: 2)
exitcode: 1 (pid: 34180)
error_file: <N/A>
msg: "Process failed with exitcode 1"
[3]:
time: 2021-09-22_11:17:30
rank: 3 (local_rank: 3)
exitcode: 1 (pid: 34181)
error_file: <N/A>
msg: "Process failed with exitcode 1"
22 11:17:31 using devices 0, 1, 2, 3
_