125 lines
5.4 KiB
Python
125 lines
5.4 KiB
Python
"""Machine-specific default values.
|
|
|
|
To allow running out-of-the-box on other machines, add code here."""
|
|
|
|
from collections import namedtuple
|
|
import os
|
|
from ctsm.joblauncher.job_launcher_factory import JOB_LAUNCHER_QSUB
|
|
from ctsm.machine import CREATE_TEST_QUEUE_UNSPECIFIED
|
|
from ctsm.machine_utils import get_user
|
|
|
|
# Record type holding the default settings for one machine; MACHINE_DEFAULTS maps each
# machine name to one of these.
MachineDefaults = namedtuple(
    "MachineDefaults",
    [
        "job_launcher_type",  # one of the JOB_LAUNCHERs defined in job_launcher_factory
        "scratch_dir",  # str
        "baseline_dir",  # str: standard location for CTSM baselines on this machine
        "account_required",  # bool: whether an account number is required (error-checking)
        "create_test_retry",  # int: default number of retries for a create_test job
        "create_test_queue",  # str: default create_test queue, or CREATE_TEST_QUEUE_UNSPECIFIED
        "job_launcher_defaults",  # dict: JOB_LAUNCHER key -> launcher-specific defaults
    ],
)
|
|
# job_launcher_type: one of the JOB_LAUNCHERs defined in job_launcher_factory
# scratch_dir: str
# baseline_dir: str: The standard location for CTSM baselines on this machine
# job_launcher_defaults: dict: keys are the JOB_LAUNCHERs defined in job_launcher_factory,
#     values are types defined here (like QsubDefaults). A given machine's defaults can
#     have 0, 1 or multiple job_launcher_defaults. (It can be useful to have defaults even
#     for the non-default job launcher for this machine, in case the user chooses a
#     non-default launcher.)
# create_test_retry: int: Default number of times to retry a create_test job on this machine
# create_test_queue: str: Default queue to use for create_test; if this is
#     CREATE_TEST_QUEUE_UNSPECIFIED, then we won't add a '--queue' option to create_test,
#     instead leaving that value unspecified, allowing CIME to pick an appropriate queue
#     for each test using its standard mechanisms.
# account_required: bool: whether an account number is required on this machine (not
#     really a default, but used for error-checking)
|
|
|
|
# Note that the different job launcher types have different structures defining their
# defaults, because different ones require different elements to be set. For now we only
# have defaults for qsub, because other launchers (like no_batch) don't need any
# arguments.
|
|
|
|
# Defaults specific to the qsub job launcher; stored in a machine's
# job_launcher_defaults dict under the JOB_LAUNCHER_QSUB key.
QsubDefaults = namedtuple(
    "QsubDefaults",
    [
        "queue",  # str: default queue name
        "walltime",  # str: default walltime, written as "HH:MM:SS" in this file
        "extra_args",  # str: additional launcher arguments (empty for all machines here)
        "required_args",  # str: launcher arguments that are always supplied
    ],
)
|
|
|
|
# Default settings for each known machine, keyed by machine name. To support a new
# machine out-of-the-box, add an entry here.
MACHINE_DEFAULTS = {
    "cheyenne": MachineDefaults(
        job_launcher_type=JOB_LAUNCHER_QSUB,
        scratch_dir=os.path.join(os.path.sep, "glade", "scratch", get_user()),
        baseline_dir=os.path.join(
            os.path.sep, "glade", "p", "cgd", "tss", "To_Be_Safely_Deleted", "ctsm_baselines"
        ),
        account_required=True,
        create_test_retry=0,
        # NOTE(wjs, 2022-02-23) By default, use the regular queue, even for
        # single-processor jobs. This is because the share queue has been really flaky,
        # with lots of job failures or slow-running jobs.
        create_test_queue="regular",
        job_launcher_defaults={
            JOB_LAUNCHER_QSUB: QsubDefaults(
                queue="regular",
                walltime="11:50:00",
                extra_args="",
                # The following assumes a single node, with a single mpi proc; we may want
                # to add more flexibility in the future, making the node / proc counts
                # individually selectable
                required_args="-l select=1:ncpus=36:mpiprocs=1 -V -r n -l inception=login -k oed",
            ),
        },
    ),
    "derecho": MachineDefaults(
        job_launcher_type=JOB_LAUNCHER_QSUB,
        scratch_dir=os.path.join(os.path.sep, "glade", "derecho", "scratch", get_user()),
        baseline_dir=os.path.join(os.path.sep, "glade", "campaign", "cgd", "tss", "ctsm_baselines"),
        account_required=True,
        create_test_retry=0,
        # Leave the queue unspecified so that no '--queue' option is added to create_test
        create_test_queue=CREATE_TEST_QUEUE_UNSPECIFIED,
        job_launcher_defaults={
            JOB_LAUNCHER_QSUB: QsubDefaults(
                queue="main",
                walltime="03:50:00",
                extra_args="",
                # The following assumes a single node, with a single mpi proc; we may want
                # to add more flexibility in the future, making the node / proc counts
                # individually selectable
                required_args="-l select=1:ncpus=128:mpiprocs=1 -V -r n -k oed",
            ),
        },
    ),
    "hobart": MachineDefaults(
        job_launcher_type=JOB_LAUNCHER_QSUB,
        scratch_dir=os.path.join(os.path.sep, "scratch", "cluster", get_user()),
        baseline_dir=os.path.join(os.path.sep, "fs", "cgd", "csm", "ccsm_baselines"),
        account_required=False,
        create_test_retry=0,
        create_test_queue=CREATE_TEST_QUEUE_UNSPECIFIED,
        job_launcher_defaults={
            JOB_LAUNCHER_QSUB: QsubDefaults(
                queue="medium",
                walltime="04:00:00",
                extra_args="",
                required_args="-l nodes=1:ppn=48 -r n",
            ),
        },
    ),
    "izumi": MachineDefaults(
        job_launcher_type=JOB_LAUNCHER_QSUB,
        scratch_dir=os.path.join(os.path.sep, "scratch", "cluster", get_user()),
        baseline_dir=os.path.join(os.path.sep, "fs", "cgd", "csm", "ccsm_baselines"),
        account_required=False,
        # jobs on izumi experience a high frequency of failures, often at the very end of
        # the job; so we'll automatically retry a failed job twice before giving up on it
        create_test_retry=2,
        create_test_queue=CREATE_TEST_QUEUE_UNSPECIFIED,
        job_launcher_defaults={
            JOB_LAUNCHER_QSUB: QsubDefaults(
                queue="medium",
                walltime="04:00:00",
                extra_args="",
                required_args="-l nodes=1:ppn=48 -r n",
            ),
        },
    ),
}
|