# Traceback (most recent call last):
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 424, in <module>
Traceback (most recent call last):
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 424, in <module>
main()
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 396, in main
main()
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 396, in main
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1624, in train
Traceback (most recent call last):
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 424, in <module>
main()
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 396, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1624, in train
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1624, in train
return inner_training_loop(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1961, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 2911, in training_step
return inner_training_loop(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1961, in _inner_training_loop
self.accelerator.backward(loss)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1960, in backward
return inner_training_loop(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1961, in _inner_training_loop
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 167, in backward
self.engine.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1964, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2040, in backward
tr_loss_step = self.training_step(model, inputs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 2911, in training_step
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
tr_loss_step = self.training_step(model, inputs)
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 2911, in training_step
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 485, in backward
.mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
RuntimeError: The size of tensor a (32) must match the size of tensor b (8) at non-singleton dimension 0
self.accelerator.backward(loss)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1960, in backward
self.accelerator.backward(loss)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1960, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 167, in backward
self.engine.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1964, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 167, in backward
self.engine.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1964, in backward
Traceback (most recent call last):
Traceback (most recent call last):
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 424, in <module>
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 424, in <module>
self.optimizer.backward(loss, retain_graph=retain_graph)
Traceback (most recent call last):
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2040, in backward
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 424, in <module>
Traceback (most recent call last):
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 424, in <module>
main()
main()
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 396, in main
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 396, in main
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2040, in backward
main()
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 396, in main
main()
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 396, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1624, in train
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1624, in train
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1624, in train
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1624, in train
torch.autograd.backward(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
scaled_loss.backward(retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
return user_fn(self, *args)
return inner_training_loop(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
return inner_training_loop(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1961, in _inner_training_loop
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1961, in _inner_training_loop
torch.autograd.backward(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
return inner_training_loop(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1961, in _inner_training_loop
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
return inner_training_loop(
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1961, in _inner_training_loop
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
return user_fn(self, *args)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 485, in backward
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
.mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
RuntimeError: The size of tensor a (32) must match the size of tensor b (8) at non-singleton dimension 0
tr_loss_step = self.training_step(model, inputs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 2911, in training_step
tr_loss_step = self.training_step(model, inputs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 2911, in training_step
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
tr_loss_step = self.training_step(model, inputs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 2911, in training_step
return user_fn(self, *args)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 485, in backward
tr_loss_step = self.training_step(model, inputs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 2911, in training_step
.mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
RuntimeError: The size of tensor a (32) must match the size of tensor b (8) at non-singleton dimension 0
self.accelerator.backward(loss)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1960, in backward
self.accelerator.backward(loss)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1960, in backward
self.accelerator.backward(loss)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1960, in backward
self.accelerator.backward(loss)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1960, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 167, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 167, in backward
self.engine.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.engine.backward(loss, **kwargs)
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 167, in backward
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1964, in backward
ret_val = func(*args, **kwargs)
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1964, in backward
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 167, in backward
self.engine.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
self.engine.backward(loss, **kwargs)
ret_val = func(*args, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1964, in backward
ret_val = func(*args, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1964, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2040, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2040, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2040, in backward
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2040, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
torch.autograd.backward(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
scaled_loss.backward(retain_graph=retain_graph)
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
scaled_loss.backward(retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
return user_fn(self, *args)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
torch.autograd.backward(
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
return user_fn(self, *args)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
return user_fn(self, *args)
return user_fn(self, *args)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 485, in backward
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
torch.autograd.backward(outputs_with_grad, args_with_grad)
return user_fn(self, *args)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 485, in backward
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 485, in backward
.mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
RuntimeError: The size of tensor a (32) must match the size of tensor b (8) at non-singleton dimension 0
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
.mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
.mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
RuntimeError: The size of tensor a (32) must match the size of tensor b (8) at non-singleton dimension 0
RuntimeError: The size of tensor a (32) must match the size of tensor b (8) at non-singleton dimension 0
return user_fn(self, *args)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 485, in backward
.mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
RuntimeError: The size of tensor a (32) must match the size of tensor b (8) at non-singleton dimension 0
Traceback (most recent call last):
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 424, in <module>
main()
File "/share1/zouff/py_pro/Chinese-Mixtral/scripts/training/run_clm_sft_with_peft.py", line 396, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1624, in train
return inner_training_loop(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 1961, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/transformers/trainer.py", line 2911, in training_step
self.accelerator.backward(loss)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/accelerator.py", line 1960, in backward
self.deepspeed_engine_wrapped.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/accelerate/utils/deepspeed.py", line 167, in backward
self.engine.backward(loss, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1964, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2040, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/_tensor.py", line 492, in backward
torch.autograd.backward(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 288, in backward
torch.autograd.backward(outputs_with_grad, args_with_grad)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/__init__.py", line 251, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/autograd/function.py", line 288, in apply
return user_fn(self, *args)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py", line 485, in backward
.mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
RuntimeError: The size of tensor a (32) must match the size of tensor b (8) at non-singleton dimension 0
0%| | 0/3810 [00:03<?, ?it/s]
[2024-03-21 09:55:50,608] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 34314) of binary: /home/zouff/anaconda3/envs/glm/bin/python
Traceback (most recent call last):
File "/home/zouff/anaconda3/envs/glm/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/zouff/anaconda3/envs/glm/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
run_clm_sft_with_peft.py FAILED
------------------------------------------------------------
Failures:
[1]:
time : 2024-03-21_09:55:50
host : g01
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 34315)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
time : 2024-03-21_09:55:50
host : g01
rank : 2 (local_rank: 2)
exitcode : 1 (pid: 34316)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
time : 2024-03-21_09:55:50
host : g01
rank : 3 (local_rank: 3)
exitcode : 1 (pid: 34317)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[4]:
time : 2024-03-21_09:55:50
host : g01
rank : 4 (local_rank: 4)
exitcode : 1 (pid: 34318)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[5]:
time : 2024-03-21_09:55:50
host : g01
rank : 5 (local_rank: 5)
exitcode : 1 (pid: 34319)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[6]:
time : 2024-03-21_09:55:50
host : g01
rank : 6 (local_rank: 6)
exitcode : 1 (pid: 34320)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[7]:
time : 2024-03-21_09:55:50
host : g01
rank : 7 (local_rank: 7)
exitcode : 1 (pid: 34321)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-03-21_09:55:50
host : g01
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 34314)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================