关闭指定 PID 的程序:utils.sh

本文介绍了一套实用的 Shell 脚本工具函数,涵盖日志管理、进程控制、文件操作及 NFS 挂载等关键功能。文中通过具体示例展示了如何将输出重定向到日志文件、通过 pidfile 管理进程、检查并安装 sudoers 文件,以及确保文件中包含特定的行。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >


# Make sure the log directory exists before anything is redirected into it.
mkdir -p /var/vcap/sys/log

# From here on, everything the script writes to stdout/stderr goes through tee:
#   - tee's file argument is a process substitution that feeds `logger`,
#     tagged vcap.<script>.stdout / vcap.<script>.stderr;
#   - tee's own stdout is appended to /var/vcap/sys/log/<script>.log
#     (or <script>.err.log for stderr).
# "$0" and the resulting names are quoted so that a script path containing
# whitespace still produces a single tag and a single log file name.
exec > >(tee -a >(logger -p user.info -t "vcap.$(basename "$0").stdout") >>"/var/vcap/sys/log/$(basename "$0").log")
exec 2> >(tee -a >(logger -p user.error -t "vcap.$(basename "$0").stderr") >>"/var/vcap/sys/log/$(basename "$0").err.log")

# Refuse to start when the process recorded in a pidfile is still alive.
#
# Prints a start banner to both stdout and stderr, then inspects the pidfile:
#   - if it names a pid for which /proc/<pid> exists, prints a message and
#     exits the calling script with status 1;
#   - otherwise the pidfile is considered stale and is removed.
# Nothing else happens when the pidfile does not exist.
#
# Arguments:
#   $1 - path to the pidfile
#   $2 - human-readable process name used in the "already running" message
pid_guard() {
  local pidfile=$1
  local name=$2
  local pid

  echo "------------ STARTING $(basename "$0") at $(date) --------------" | tee /dev/stderr

  if [ -f "$pidfile" ]; then
    pid=$(head -n 1 "$pidfile")
    # Drop stray whitespace so that " 1234 " is still recognised as pid 1234
    # now that the /proc path below is quoted.
    pid=${pid//[[:space:]]/}

    if [ -n "$pid" ] && [ -e "/proc/$pid" ]; then
      echo "$name is already running, please stop it first"
      exit 1
    fi

    echo "Removing stale pidfile..."
    # Quoted so a pidfile path containing spaces is actually removed.
    rm -- "$pidfile"
  fi
}

# Wait for the process named in a pidfile to exit, optionally killing it.
#
# Arguments:
#   $1 - path to the pidfile
#   $2 - "1" to send SIGTERM to the process before waiting
#   $3 - timeout in seconds (default 0 = wait until the process exits)
#   $4 - "1" to send SIGKILL once the timeout has elapsed (default 0)
#
# Behaviour:
#   - prints "Pidfile ... doesn't exist" and returns when there is no pidfile;
#   - exits the calling script with status 1 when the pidfile holds no pid;
#   - polls /proc/<pid> every 0.1s, printing a dot roughly once per second
#     while a timeout is counting down;
#   - prints "Stopped" or "Timed Out" depending on whether /proc/<pid> is gone;
#   - removes the pidfile afterwards, including after a timeout.
wait_pidfile() {
  local pidfile=$1
  local try_kill=$2
  local timeout=${3:-0}
  local force=${4:-0}
  local countdown=$(( timeout * 10 ))   # number of 0.1s polling ticks
  local pid

  if [ ! -f "$pidfile" ]; then
    echo "Pidfile $pidfile doesn't exist"
    return 0
  fi

  pid=$(head -n 1 "$pidfile")
  # Strip whitespace: a blank-but-not-empty first line would otherwise make
  # the check below test "/proc/", which always exists.
  pid=${pid//[[:space:]]/}

  if [ -z "$pid" ]; then
    echo "Unable to get pid from $pidfile"
    exit 1
  fi

  if [ -e "/proc/$pid" ]; then
    if [ "$try_kill" = "1" ]; then
      echo "Killing $pidfile: $pid "
      kill "$pid"
    fi

    while [ -e "/proc/$pid" ]; do
      sleep 0.1
      # One progress dot per 10 ticks; never printed when no timeout is set.
      if (( countdown != 0 && countdown % 10 == 0 )); then
        printf '.'
      fi
      if (( timeout > 0 )); then
        if (( countdown == 0 )); then
          if [ "$force" = "1" ]; then
            printf '\nKill timed out, using kill -9 on %s... ' "$pid"
            kill -9 "$pid"
            # Give the kernel a moment to tear the process down.
            sleep 0.5
          fi
          break
        fi
        countdown=$(( countdown - 1 ))
      fi
    done

    if [ -e "/proc/$pid" ]; then
      echo "Timed Out"
    else
      echo "Stopped"
    fi
  else
    echo "Process $pid is not running"
  fi

  # Quoted so a pidfile path containing spaces is actually removed.
  rm -f -- "$pidfile"
}

# Send SIGTERM to the process named in a pidfile and wait for it to exit.
#
# Thin wrapper around wait_pidfile with try_kill fixed to 1.
#
# Arguments:
#   $1 - path to the pidfile
#   $2 - timeout in seconds (default 25)
#   $3 - "1" to fall back to SIGKILL after the timeout (default 1)
kill_and_wait() {
  local pidfile=$1
  # Monit default timeout for start/stop is 30s
  # Append 'with timeout {n} seconds' to monit start/stop program configs
  local timeout=${2:-25}
  local force=${3:-1}

  # Quoted so a pidfile path containing spaces stays a single argument.
  wait_pidfile "$pidfile" 1 "$timeout" "$force"
}

# Ensure something is mounted at a mount point, mounting it if necessary.
#
# Arguments:
#   $1 - options passed to mount (word-split on purpose, e.g. "-t nfs -o rw")
#   $2 - what to mount (the export)
#   $3 - mount point
#
# The mount point is compared exactly against the second field of each
# /proc/mounts entry, so "/mnt/data" is not mistaken for "/mnt/data2".
# Exits the calling script with status 1 when the mount command fails.
check_mount() {
  local opts=$1
  local exports=$2
  local mount_point=$3
  local wanted=$mount_point
  local mounted=0
  local _device target _rest

  # Ignore trailing slashes for the comparison only ("/mnt/data/" == "/mnt/data").
  while [[ "$wanted" == */ && "$wanted" != "/" ]]; do
    wanted=${wanted%/}
  done

  if [[ -r /proc/mounts ]]; then
    while read -r _device target _rest; do
      # Spaces in mount points are written as \040 in /proc/mounts.
      target=${target//\\040/ }
      if [[ "$target" == "$wanted" ]]; then
        mounted=1
        break
      fi
    done < /proc/mounts
  fi

  if (( mounted )); then
    echo "Found NFS mount $mount_point"
  else
    echo "Mounting NFS..."
    # $opts is deliberately unquoted so that it expands to several options.
    # shellcheck disable=SC2086
    if ! mount $opts "$exports" "$mount_point"; then
      echo "Cannot mount NFS from $exports to $mount_point, exiting..."
      exit 1
    fi
  fi
}

# Validate a sudoers file without installing it.
#
# Arguments:
#   $1 - path of the sudoers file to check
# Returns:
#   the exit status of `visudo -c -f <file>`
check_sudoers() {
  local candidate=$1

  /usr/sbin/visudo -c -f "$candidate"
}

# Check the syntax of a sudoers file and, if it is ok, install it.
#
# Arguments:
#   $1 - source sudoers file; its owner is set to root:root and its mode to 0440
#   $2 - destination path the file is copied to (attributes preserved)
#
# Exits the calling script with status 1 when the syntax check fails.
# Returns 1 without copying anything when ownership or mode cannot be set,
# or when the copy itself fails, so a file with the wrong owner or
# permissions is never installed and reported as a success.
install_sudoers() {
  local src=$1
  local dest=$2

  if ! check_sudoers "$src"; then
    echo "Syntax error in sudoers file $src"
    exit 1
  fi

  # Each step only runs if the previous one succeeded.
  if ! { chown root:root -- "$src" \
      && chmod 0440 -- "$src" \
      && cp -p -- "$src" "$dest"; }; then
    echo "Failed to install sudoers file $src to $dest"
    return 1
  fi
}

# Add a line to a file if it is not already there.
#
# Arguments:
#   $1 - file to update (created when it does not exist)
#   $2 - line that must be present
#
# The line is compared literally and against whole lines only, so "foo" is
# not considered present just because "foobar" is, and regex metacharacters
# in the line have no special meaning.
# Exits the calling script with status 1 when no file name is given.
file_must_include() {
  local file=$1
  local line=$2

  # Protect against empty $file so grep doesn't wait for input on stdin.
  if [ -z "$file" ]; then
    echo 'File name is required'
    exit 1
  fi

  # -x whole line, -F fixed string, -s stay quiet if the file is missing,
  # -- so a line starting with '-' is not parsed as an option.
  if grep -qsxF -- "$line" "$file"; then
    return 0
  fi

  # If the file does not end in a newline, terminate its last line first so
  # the new line is not glued onto it.
  if [ -s "$file" ] && [ -n "$(tail -c 1 "$file")" ]; then
    printf '\n' >> "$file"
  fi

  printf '%s\n' "$line" >> "$file"
}


root@b82c95bfc43f:/workspace/data-zj/test_code_v1# . train.sh /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( W0529 02:52:35.084000 938 torch/distributed/run.py:792] W0529 02:52:35.084000 938 torch/distributed/run.py:792] ***************************************** W0529 02:52:35.084000 938 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. W0529 02:52:35.084000 938 torch/distributed/run.py:792] ***************************************** [W529 02:52:39.311003207 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3 [W529 02:52:43.313869930 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3 /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. 
Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. 
warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. 
warnings.warn( /usr/local/lib/python3.10/dist-packages/torch/utils/_pytree.py:185: FutureWarning: optree is installed but the version is too old to support PyTorch Dynamo in C++ pytree. C++ pytree support is disabled. Please consider upgrading optree using `python3 -m pip install --upgrade 'optree>=0.13.0'`. warnings.warn( Traceback (most recent call last): File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/train.py", line 17, in <module> from axolotl.cli.config import load_cfg File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/config.py", line 19, in <module> from axolotl.utils.config import ( File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/config/__init__.py", line 16, in <module> from axolotl.utils.models import MULTIMODAL_AUTO_MODEL_MAPPING, load_model_config File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/models.py", line 17, in <module> import transformers.modeling_utils File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 62, in <module> from .integrations.flash_attention import flash_attention_forward File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/flash_attention.py", line 5, in <module> from ..modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_flash_attention_utils.py", line 36, in <module> from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa File "/usr/local/lib/python3.10/dist-packages/flash_attn/__init__.py", line 3, in <module> from flash_attn.flash_attn_interface import ( File "/usr/local/lib/python3.10/dist-packages/flash_attn/flash_attn_interface.py", line 10, in <module> import flash_attn_2_cuda as flash_attn_cuda 
ImportError: /usr/local/lib/python3.10/dist-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE Traceback (most recent call last): File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/train.py", line 17, in <module> from axolotl.cli.config import load_cfg File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/config.py", line 19, in <module> from axolotl.utils.config import ( File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/config/__init__.py", line 16, in <module> from axolotl.utils.models import MULTIMODAL_AUTO_MODEL_MAPPING, load_model_config File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/models.py", line 17, in <module> import transformers.modeling_utils File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 62, in <module> from .integrations.flash_attention import flash_attention_forward File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/flash_attention.py", line 5, in <module> from ..modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_flash_attention_utils.py", line 36, in <module> from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa File "/usr/local/lib/python3.10/dist-packages/flash_attn/__init__.py", line 3, in <module> from flash_attn.flash_attn_interface import ( File "/usr/local/lib/python3.10/dist-packages/flash_attn/flash_attn_interface.py", line 10, in <module> import flash_attn_2_cuda as flash_attn_cuda ImportError: /usr/local/lib/python3.10/dist-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so: undefined 
symbol: _ZN3c105ErrorC2ENS_14SourceLocationENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE Traceback (most recent call last): File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/train.py", line 17, in <module> from axolotl.cli.config import load_cfg File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/config.py", line 19, in <module> from axolotl.utils.config import ( File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/config/__init__.py", line 16, in <module> from axolotl.utils.models import MULTIMODAL_AUTO_MODEL_MAPPING, load_model_config File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/models.py", line 17, in <module> import transformers.modeling_utils File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 62, in <module> from .integrations.flash_attention import flash_attention_forward File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/flash_attention.py", line 5, in <module> from ..modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_flash_attention_utils.py", line 36, in <module> from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa File "/usr/local/lib/python3.10/dist-packages/flash_attn/__init__.py", line 3, in <module> from flash_attn.flash_attn_interface import ( File "/usr/local/lib/python3.10/dist-packages/flash_attn/flash_attn_interface.py", line 10, in <module> import flash_attn_2_cuda as flash_attn_cuda ImportError: /usr/local/lib/python3.10/dist-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE Traceback (most recent 
call last): File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/train.py", line 17, in <module> from axolotl.cli.config import load_cfg File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/config.py", line 19, in <module> from axolotl.utils.config import ( File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/config/__init__.py", line 16, in <module> from axolotl.utils.models import MULTIMODAL_AUTO_MODEL_MAPPING, load_model_config File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/models.py", line 17, in <module> import transformers.modeling_utils File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 62, in <module> from .integrations.flash_attention import flash_attention_forward File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/flash_attention.py", line 5, in <module> from ..modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_flash_attention_utils.py", line 36, in <module> from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa File "/usr/local/lib/python3.10/dist-packages/flash_attn/__init__.py", line 3, in <module> from flash_attn.flash_attn_interface import ( File "/usr/local/lib/python3.10/dist-packages/flash_attn/flash_attn_interface.py", line 10, in <module> import flash_attn_2_cuda as flash_attn_cuda ImportError: /usr/local/lib/python3.10/dist-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE Traceback (most recent call last): File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, 
None, File "/usr/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/train.py", line 17, in <module> from axolotl.cli.config import load_cfg File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/config.py", line 19, in <module> from axolotl.utils.config import ( File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/config/__init__.py", line 16, in <module> from axolotl.utils.models import MULTIMODAL_AUTO_MODEL_MAPPING, load_model_config File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/models.py", line 17, in <module> import transformers.modeling_utils File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 62, in <module> from .integrations.flash_attention import flash_attention_forward File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/flash_attention.py", line 5, in <module> from ..modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_flash_attention_utils.py", line 36, in <module> from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa File "/usr/local/lib/python3.10/dist-packages/flash_attn/__init__.py", line 3, in <module> from flash_attn.flash_attn_interface import ( File "/usr/local/lib/python3.10/dist-packages/flash_attn/flash_attn_interface.py", line 10, in <module> import flash_attn_2_cuda as flash_attn_cuda ImportError: /usr/local/lib/python3.10/dist-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE Traceback (most recent call last): File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File 
"/usr/local/lib/python3.10/dist-packages/axolotl/cli/train.py", line 17, in <module> from axolotl.cli.config import load_cfg File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/config.py", line 19, in <module> from axolotl.utils.config import ( File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/config/__init__.py", line 16, in <module> from axolotl.utils.models import MULTIMODAL_AUTO_MODEL_MAPPING, load_model_config File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/models.py", line 17, in <module> import transformers.modeling_utils File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 62, in <module> from .integrations.flash_attention import flash_attention_forward File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/flash_attention.py", line 5, in <module> from ..modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_flash_attention_utils.py", line 36, in <module> from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa File "/usr/local/lib/python3.10/dist-packages/flash_attn/__init__.py", line 3, in <module> from flash_attn.flash_attn_interface import ( File "/usr/local/lib/python3.10/dist-packages/flash_attn/flash_attn_interface.py", line 10, in <module> import flash_attn_2_cuda as flash_attn_cuda ImportError: /usr/local/lib/python3.10/dist-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE Traceback (most recent call last): File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/train.py", line 17, in <module> from axolotl.cli.config import 
load_cfg File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/config.py", line 19, in <module> from axolotl.utils.config import ( File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/config/__init__.py", line 16, in <module> from axolotl.utils.models import MULTIMODAL_AUTO_MODEL_MAPPING, load_model_config File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/models.py", line 17, in <module> import transformers.modeling_utils File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 62, in <module> from .integrations.flash_attention import flash_attention_forward File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/flash_attention.py", line 5, in <module> from ..modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_flash_attention_utils.py", line 36, in <module> from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa File "/usr/local/lib/python3.10/dist-packages/flash_attn/__init__.py", line 3, in <module> from flash_attn.flash_attn_interface import ( File "/usr/local/lib/python3.10/dist-packages/flash_attn/flash_attn_interface.py", line 10, in <module> import flash_attn_2_cuda as flash_attn_cuda ImportError: /usr/local/lib/python3.10/dist-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE Traceback (most recent call last): File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/train.py", line 17, in <module> from axolotl.cli.config import load_cfg File "/usr/local/lib/python3.10/dist-packages/axolotl/cli/config.py", line 19, in <module> from 
axolotl.utils.config import ( File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/config/__init__.py", line 16, in <module> from axolotl.utils.models import MULTIMODAL_AUTO_MODEL_MAPPING, load_model_config File "/usr/local/lib/python3.10/dist-packages/axolotl/utils/models.py", line 17, in <module> import transformers.modeling_utils File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 62, in <module> from .integrations.flash_attention import flash_attention_forward File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/flash_attention.py", line 5, in <module> from ..modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_flash_attention_utils.py", line 36, in <module> from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa File "/usr/local/lib/python3.10/dist-packages/flash_attn/__init__.py", line 3, in <module> from flash_attn.flash_attn_interface import ( File "/usr/local/lib/python3.10/dist-packages/flash_attn/flash_attn_interface.py", line 10, in <module> import flash_attn_2_cuda as flash_attn_cuda ImportError: /usr/local/lib/python3.10/dist-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE W0529 02:52:48.206000 938 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 1003 closing signal SIGTERM W0529 02:52:48.207000 938 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 1004 closing signal SIGTERM W0529 02:52:48.207000 938 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 1005 closing signal SIGTERM W0529 02:52:48.207000 938 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 1006 closing signal SIGTERM W0529 02:52:48.207000 938 torch/distributed/elastic/multiprocessing/api.py:897] 
Sending process 1007 closing signal SIGTERM W0529 02:52:48.207000 938 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 1009 closing signal SIGTERM W0529 02:52:48.208000 938 torch/distributed/elastic/multiprocessing/api.py:897] Sending process 1010 closing signal SIGTERM E0529 02:52:48.385000 938 torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 5 (pid: 1008) of binary: /usr/bin/python Traceback (most recent call last): File "/usr/local/bin/torchrun", line 8, in <module> sys.exit(main()) File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper return f(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 918, in main run(args) File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 909, in run elastic_launch( File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 138, in __call__ return launch_agent(self._config, self._entrypoint, list(args)) File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 269, in launch_agent raise ChildFailedError( torch.distributed.elastic.multiprocessing.errors.ChildFailedError: ============================================================ axolotl.cli.train FAILED ------------------------------------------------------------ Failures: <NO_OTHER_FAILURES> ------------------------------------------------------------ Root Cause (first observed failure): [0]: time : 2025-05-29_02:52:48 host : b82c95bfc43f rank : 5 (local_rank: 5) exitcode : 1 (pid: 1008) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ============================================================
05-30
; write=0.006 s, sync=0.001 s, total=0.024 s; sync files=2, longest=0.001 s, average=0.001 s; distance=0 kB, estimate=0 kB 2025-07-25 11:25:49.627 UTC [1] LOG: database system is ready to accept connections 2025-07-25 11:25:49.628 UTC [11305] FATAL: could not open file "global/pg_filenode.map": No such file or directory 2025-07-25 11:25:49.628 UTC [11306] FATAL: could not open file "global/pg_filenode.map": No such file or directory 2025-07-25 11:25:49.629 UTC [1] LOG: autovacuum launcher process (PID 11305) exited with exit code 1 2025-07-25 11:25:49.629 UTC [1] LOG: terminating any other active server processes 2025-07-25 11:25:49.630 UTC [1] LOG: background worker "logical replication launcher" (PID 11306) exited with exit code 1 2025-07-25 11:25:49.631 UTC [1] LOG: all server processes terminated; reinitializing 2025-07-25 11:25:49.680 UTC [11307] LOG: database system was interrupted; last known up at 2025-07-25 11:25:49 UTC 2025-07-25 11:25:49.753 UTC [11307] LOG: database system was not properly shut down; automatic recovery in progress 2025-07-25 11:25:49.759 UTC [11307] LOG: invalid record length at 0/1478CF0: wanted 24, got 0 2025-07-25 11:25:49.759 UTC [11318] FATAL: the database system is in recovery mode 2025-07-25 11:25:49.759 UTC [11307] LOG: redo is not required 2025-07-25 11:25:49.765 UTC [11308] LOG: checkpoint starting: end-of-recovery immediate wait 2025-07-25 11:25:49.774 UTC [1] LOG: received smart shutdown request 2025-07-25 11:25:49.789 UTC [11308] LOG: checkpoint complete: wrote 3 buffers (0.0%); 0 WAL file(s) added, 0 removed, 0 recycled; write=0.008 s, sync=0.001 s, total=0.026 s; sync files=2, longest=0.001 s, average=0.001 s; distance=0 kB, estimate=0 kB 2025-07-25 11:25:49.793 UTC [11308] LOG: shutting down 2025-07-25 11:25:49.794 UTC [11308] LOG: checkpoint starting: shutdown immediate 2025-07-25 11:25:49.810 UTC [11308] LOG: checkpoint complete: wrote 0 buffers (0.0%); 0 WAL file(s) added, 0 removed, 0 recycled; write=0.002 s, 
sync=0.001 s, total=0.017 s; sync files=0, longest=0.000 s, average=0.000 s; distance=0 kB, estimate=0 kB 2025-07-25 11:25:49.826 UTC [1] LOG: database system is shut down [root@k8s-master1 harbor]# [root@k8s-master1 harbor]# kubectl logs harbor-core-75cd4f54b5-b5stj -n harbor --previous Appending internal tls trust CA to ca-bundle ... find: '/etc/harbor/ssl': No such file or directory Internal tls trust CA appending is Done. init global config instance failed. If you do not use this, just ignore it. open conf/app.conf: no such file or directory 2025-07-25T11:29:52Z [INFO] [/controller/artifact/annotation/parser.go:85]: the annotation parser to parser artifact annotation version v1alpha1 registered 2025-07-25T11:29:52Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.cncf.helm.config.v1+json registered 2025-07-25T11:29:52Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.cnab.manifest.v1 registered 2025-07-25T11:29:52Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.cnai.model.manifest.v1+json registered 2025-07-25T11:29:52Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.oci.image.index.v1+json registered 2025-07-25T11:29:52Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.docker.distribution.manifest.list.v2+json registered 2025-07-25T11:29:52Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.docker.distribution.manifest.v1+prettyjws registered 2025-07-25T11:29:52Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.oci.image.config.v1+json registered 2025-07-25T11:29:52Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to 
process media type application/vnd.docker.container.image.v1+json registered 2025-07-25T11:29:52Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.goharbor.harbor.sbom.v1 registered 2025-07-25T11:29:52Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.wasm.config.v1+json registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/native/adapter.go:36]: the factory for adapter docker-registry registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/aliacr/adapter.go:40]: the factory for adapter ali-acr registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/awsecr/adapter.go:44]: the factory for adapter aws-ecr registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/azurecr/adapter.go:29]: Factory for adapter azure-acr registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/dockerhub/adapter.go:40]: Factory for adapter docker-hub registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/dtr/adapter.go:36]: the factory of dtr adapter was registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/githubcr/adapter.go:43]: the factory for adapter github-ghcr registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/gitlab/adapter.go:33]: the factory for adapter gitlab registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/googlegcr/adapter.go:37]: the factory for adapter google-gcr registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/huawei/huawei_adapter.go:40]: the factory of Huawei adapter was registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/jfrog/adapter.go:42]: the factory of jfrog artifactory adapter was registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/quay/adapter.go:53]: the factory of Quay adapter was registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/tencentcr/adapter.go:55]: the factory for adapter tencent-tcr registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/volcenginecr/adapter.go:40]: the factory for adapter 
volcengine-cr registered 2025-07-25T11:29:52Z [INFO] [/pkg/reg/adapter/harbor/adaper.go:31]: the factory for adapter harbor registered 2025-07-25T11:29:52Z [INFO] [/core/controllers/base.go:187]: Config path: /etc/core/app.conf 2025-07-25T11:29:52Z [INFO] [/core/main.go:148]: initializing cache ... 2025-07-25T11:29:52Z [INFO] [/core/main.go:167]: initializing configurations... 2025-07-25T11:29:52Z [INFO] [/lib/config/systemconfig.go:178]: key path: /etc/core/key 2025-07-25T11:29:52Z [INFO] [/lib/config/config.go:92]: init secret store 2025-07-25T11:29:52Z [INFO] [/core/main.go:169]: configurations initialization completed 2025-07-25T11:29:52Z [INFO] [/common/dao/base.go:67]: Registering database: type-PostgreSQL host-harbor-database port-5432 database-registry sslmode-"disable" 2025-07-25T11:29:52Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:29:54Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:29:56Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:29:58Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:00Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:02Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:04Z [ERROR] [/common/utils/utils.go:108]: failed to connect to 
tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:06Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:08Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:10Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:12Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:14Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:16Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:18Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:20Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:22Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:24Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:26Z [ERROR] 
[/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:28Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:30Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:32Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:34Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:36Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:38Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:40Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:42Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:44Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:46Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: 
connection refused 2025-07-25T11:30:48Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:50Z [ERROR] [/common/utils/utils.go:108]: failed to connect to tcp://harbor-database:5432, retry after 2 seconds :dial tcp 10.0.192.44:5432: connect: connection refused 2025-07-25T11:30:52Z [FATAL] [/core/main.go:190]: failed to initialize database: failed to connect to tcp:harbor-database:5432 after 60 seconds [root@k8s-master1 harbor]# kubectl logs harbor-jobservice-6457b57477-7qgt2 -n harbor --previous Appending internal tls trust CA to ca-bundle ... find: '/etc/harbor/ssl': No such file or directory Internal tls trust CA appending is Done. 2025-07-25T11:30:54Z [INFO] [/controller/artifact/annotation/parser.go:85]: the annotation parser to parser artifact annotation version v1alpha1 registered 2025-07-25T11:30:54Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.cncf.helm.config.v1+json registered 2025-07-25T11:30:54Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.cnab.manifest.v1 registered 2025-07-25T11:30:54Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.cnai.model.manifest.v1+json registered 2025-07-25T11:30:54Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.oci.image.index.v1+json registered 2025-07-25T11:30:54Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.docker.distribution.manifest.list.v2+json registered 2025-07-25T11:30:54Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.docker.distribution.manifest.v1+prettyjws registered 2025-07-25T11:30:54Z [INFO] 
[/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.oci.image.config.v1+json registered 2025-07-25T11:30:54Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.docker.container.image.v1+json registered 2025-07-25T11:30:54Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.goharbor.harbor.sbom.v1 registered 2025-07-25T11:30:54Z [INFO] [/controller/artifact/processor/processor.go:59]: the processor to process media type application/vnd.wasm.config.v1+json registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/native/adapter.go:36]: the factory for adapter docker-registry registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/aliacr/adapter.go:40]: the factory for adapter ali-acr registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/awsecr/adapter.go:44]: the factory for adapter aws-ecr registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/azurecr/adapter.go:29]: Factory for adapter azure-acr registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/dockerhub/adapter.go:40]: Factory for adapter docker-hub registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/dtr/adapter.go:36]: the factory of dtr adapter was registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/githubcr/adapter.go:43]: the factory for adapter github-ghcr registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/gitlab/adapter.go:33]: the factory for adapter gitlab registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/googlegcr/adapter.go:37]: the factory for adapter google-gcr registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/huawei/huawei_adapter.go:40]: the factory of Huawei adapter was registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/jfrog/adapter.go:42]: the factory of jfrog artifactory adapter was registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/quay/adapter.go:53]: the factory of Quay adapter 
was registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/tencentcr/adapter.go:55]: the factory for adapter tencent-tcr registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/volcenginecr/adapter.go:40]: the factory for adapter volcengine-cr registered 2025-07-25T11:30:54Z [INFO] [/pkg/reg/adapter/harbor/adaper.go:31]: the factory for adapter harbor registered 2025-07-25T11:30:54Z [INFO] [/pkg/config/rest/rest.go:47]: get configuration from url: http://harbor-core:80/api/v2.0/internalconfig 2025-07-25T11:30:54Z [ERROR] [/pkg/config/rest/rest.go:50]: Failed on load rest config err:Get "http://harbor-core:80/api/v2.0/internalconfig": dial tcp 10.5.232.220:80: connect: connection refused, url:http://harbor-core:80/api/v2.0/internalconfig panic: failed to load configuration, error: failed to load rest config goroutine 1 [running]: main.main() /harbor/src/jobservice/main.go:46 +0x3ae [root@k8s-master1 harbor]# 查看日志输出这些,我应该如何解决呢,目前正在用kubernetes集群搭建harbor仓库,想让[root@k8s-master1 harbor]# kubectl get pod -n harbor NAME READY STATUS RESTARTS AGE harbor-core-75cd4f54b5-b5stj 0/1 CrashLoopBackOff 11 (118s ago) 26m harbor-database-0 0/1 Running 4 (3m16s ago) 26m harbor-jobservice-6457b57477-7qgt2 0/1 CrashLoopBackOff 13 (3m23s ago) 26m harbor-portal-5b6b5f7494-gcc8n 1/1 Running 1 (10m ago) 26m harbor-redis-0 1/1 Running 1 (10m ago) 26m harbor-registry-5fb967b497-d4r4r 2/2 Running 2 (10m ago) 26m harbor-trivy-0 1/1 Running 1 (10m ago) 26m 全是running
07-26
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值