From 246a4c146804666efbac31e416c37fd77426e1d3 Mon Sep 17 00:00:00 2001 From: ChrisPaulBennett Date: Tue, 7 Jan 2025 11:29:55 +0000 Subject: [PATCH 001/101] Initial profiler implementation (non working) Changed the name of the profiler module. Linting Profiler sends KB instead of bytes Time Series now working CPU/Memory Logging working --- cylc/flow/cfgspec/globalcfg.py | 9 + cylc/flow/etc/job.sh | 36 +++ cylc/flow/job_file.py | 6 +- cylc/flow/scripts/profile.py | 246 ++++++++++++++++++ cylc/flow/scripts/profiler.py | 194 ++++++++++++++ setup.cfg | 2 + tests/functional/jobscript/02-profiler.t | 46 ++++ .../jobscript/02-profiler/bin/foo.sh | 2 + .../jobscript/02-profiler/flow.cylc | 77 ++++++ .../jobscript/02-profiler/reference.log | 3 + tests/unit/test_job_file.py | 8 +- 11 files changed, 626 insertions(+), 3 deletions(-) create mode 100755 cylc/flow/scripts/profile.py create mode 100755 cylc/flow/scripts/profiler.py create mode 100644 tests/functional/jobscript/02-profiler.t create mode 100755 tests/functional/jobscript/02-profiler/bin/foo.sh create mode 100644 tests/functional/jobscript/02-profiler/flow.cylc create mode 100644 tests/functional/jobscript/02-profiler/reference.log diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 629f07db67d..821b3fd3d1f 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -1330,6 +1330,15 @@ def default_for( .. versionadded:: 8.0.0 ''') + + with Conf('profile'): + Conf('activate', VDR.V_BOOLEAN, True, desc=''' + A Boolean that sets if the cylc profiler will be used + + .. versionadded:: 8.0.0 + ''') + Conf('cgroups path', VDR.V_STRING, '/sys/fs/cgroup') + Conf('job runner', VDR.V_STRING, 'background', desc=f''' The system used to run jobs on the platform. 
diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index c64edfda298..39861aa1460 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -38,6 +38,8 @@ cylc__job__main() { set -x fi # Init-Script + cylc profile & + export profiler_pid="$!" & cylc__job__run_inst_func 'init_script' # Start error and vacation traps typeset signal_name= @@ -139,6 +141,12 @@ cylc__job__main() { mkdir -p "$(dirname "${CYLC_TASK_WORK_DIR}")" || true mkdir -p "${CYLC_TASK_WORK_DIR}" cd "${CYLC_TASK_WORK_DIR}" + + if [[ "${CYLC_PROFILE}" == "True" ]] ; then + cylc profile & + export profiler_pid="$!" + fi + # Env-Script, User Environment, Pre-Script, Script and Post-Script # Run user scripts in subshell to protect cylc job script from interference. # Waiting on background process allows signal traps to trigger immediately. @@ -157,12 +165,25 @@ cylc__job__main() { cylc__set_return "$ret_code" fi } + # Grab the max rss and cpu_time value before moving directory + if [[ -f "max_rss" ]]; then + max_rss=$(sed -n '1p' max_rss) + rm max_rss + fi + if [[ -f "cpu_time" ]]; then + cpu_time=$(sed -n '1p' cpu_time) + rm cpu_time + fi # Empty work directory remove cd rmdir "${CYLC_TASK_WORK_DIR}" 2>'/dev/null' || true # Send task succeeded message + cylc__kill_profiler + wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>'/dev/null' || true + cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" 'succeeded' || true + cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" 'max_rss:' "${MAXRSS}" || true # (Ignore shellcheck "globbing and word splitting" warning here). 
# shellcheck disable=SC2086 trap '' ${CYLC_VACATION_SIGNALS:-} ${CYLC_FAIL_SIGNALS} @@ -187,6 +208,20 @@ cylc__set_return() { return "${1:-0}" } +############################################################################### +# Save the data using cylc message and exit the profiler +cylc__kill_profiler() { + if [[ -n ${profiler_pid:-} ]]; then + kill -s SIGINT "${profiler_pid}" + fi + if [ -n "${max_rss}" ]; then + cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: max_rss $max_rss" || true + fi + if [ -n "${cpu_time}" ]; then + cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: cpu_time $cpu_time" || true + fi +} + ############################################################################### # Disable selected or all (if no arguments given) fail traps. # Globals: @@ -261,6 +296,7 @@ cylc__job__run_inst_func() { # Returns: # exit ${CYLC_TASK_USER_SCRIPT_EXITCODE} cylc__job_finish_err() { + cylc__kill_profiler CYLC_TASK_USER_SCRIPT_EXITCODE="${CYLC_TASK_USER_SCRIPT_EXITCODE:-$?}" typeset signal="$1" typeset run_err_script="$2" diff --git a/cylc/flow/job_file.py b/cylc/flow/job_file.py index 930331dc5a4..489b3f99c47 100644 --- a/cylc/flow/job_file.py +++ b/cylc/flow/job_file.py @@ -224,8 +224,10 @@ def _write_task_environment(self, handle, job_conf): '\n export CYLC_TASK_TRY_NUMBER=%s' % job_conf['try_num']) handle.write( "\n export CYLC_TASK_FLOW_NUMBERS=" - f"{','.join(str(f) for f in job_conf['flow_nums'])}" - ) + f"{','.join(str(f) for f in job_conf['flow_nums'])}") + handle.write( + "\n export CYLC_PROFILE=" + f"{job_conf['platform']['profile']['activate']}") # Standard parameter environment variables for var, val in job_conf['param_var'].items(): handle.write('\n export CYLC_TASK_PARAM_%s="%s"' % (var, val)) diff --git a/cylc/flow/scripts/profile.py b/cylc/flow/scripts/profile.py new file mode 100755 index 00000000000..391dbbe9ca9 --- /dev/null +++ b/cylc/flow/scripts/profile.py @@ -0,0 +1,246 @@ +#!/usr/bin/env python3 +# THIS 
FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +"""cylc profiler [OPTIONS] + +Profiler which periodically polls PBS cgroups to track +the resource usage of jobs running on the node. +""" + +import os +import sys +import time +import signal +from pathlib import Path +from dataclasses import dataclass +from argparse import ArgumentParser +from cylc.flow.terminal import cli_function +from cylc.flow.option_parsers import CylcOptionParser as COP + +INTERNAL = True + +def get_option_parser() -> COP: + parser = COP( + __doc__, + argdoc=[ + ], + ) + parser.add_option( + "-i", type=int, help="interval between query cycles in seconds", + default=10, dest="delay") + parser.add_option( + "-o", type=str, help="output directory for json file", + default=os.environ['DATADIR'], dest="output_dir") + parser.add_option( + "-m", type=str, help="Location of memory process files", + default="/sys/fs/cgroup/memory/pbspro.service/jobid", dest="memory") + + return parser + + +@cli_function(get_option_parser) +def main(parser, options): + """CLI main.""" + # Register the stop_profiler function with the signal library + signal.signal(signal.SIGINT, stop_profiler) + + profile(options) + + + +@dataclass +class Process: + """Class for representing CPU and Memory usage of a process""" + cgroup_memory_path: str + # 
cgroup_cpu_path: str + job_id: str + # system_usage: int + # cpu_usage: int + + +def stop_profiler(*args): + """This function will be executed when the SIGINT signal is sent + to this process""" + print('profiler exited') + sys.exit(0) + + +def parse_memory_file(process): + """Open the memory stat file and copy the appropriate data""" + path = os.path.join(process.cgroup_memory_path + "/" + + process.job_id, "memory.stat") + + for line in open(path): + key, value = line.strip().split() + # Grab the data we want + if key == 'rss': + return value + + +def write_data(process, data, output_dir, data_type): + + # Build the output file path + path = os.path.join(output_dir, process.job_id + data_type) + try: + with open("/home/h01/cbennett/repos/cylc_ui_gantt/cylc-flow/max_rss", 'w') as f: + f.write(data) + except IOError: + raise IOError("Unable to write memory data to file") + + +# def get_host_num_cpus(cpuset_path, processes): +# """Number of physical CPUs available to the process""" +# cpuset = open(cpuset_path + '/' + +# processes[0].job_id + '/cpuset.cpus').read() +# print("raw_data:", cpuset) +# cpu_number = cpuset.split('-') +# print('split:', cpu_number) +# number_of_cpus = ((int(cpu_number[1]) - int(cpu_number[0])) + 1) // 2 + # split_proc_stat_lines = [cpuset.split(') for line in proc_stat_lines] + # cpu_lines = [ + # split_line + # for split_line in split_proc_stat_lines + # if len(split_line) > 0 and "cpu" in split_line[0] + # ] + # # Number of lines starting with a word including 'cpu', subtracting + # # 1 for the first summary line. + # host_num_cpus = len(cpu_lines) - 1 + # print(number_of_cpus) + # return number_of_cpus + + +# def get_system_usage(): +# """ +# Computes total CPU usage of the host in nanoseconds. 
+# See also the / proc / stat entry here: +# https://man7.org/linux/man-pages/man5/proc.5.html +# """ +# # Grab usage data from proc/stat +# usage_data = open('/proc/stat').read().split("\n")[0].split()[1:8] +# +# total_clock_ticks = sum(int(entry) for entry in usage_data) +# # 100 clock ticks per second, 10^9 ns per second +# usage_ns = total_clock_ticks * 10 ** 7 +# return usage_ns +# +# +# def get_cpu_percent(num_of_cpus, proc_path, process, +# last_system_usage, last_cpu_usage): +# +# time.sleep(5) +# # Find cpuacct.usage files +# cpu_usage = int(open(process.cgroup_cpu_path + "/" + +# process.job_id + "/cpuacct.usage").read()) +# system_usage = get_system_usage() +# +# # Since deltas are not initially available, return 0.0 on first call. +# if last_system_usage is None: +# cpu_percent = 0.0 +# else: +# cpu_delta = cpu_usage - last_cpu_usage +# # "System time passed." (Typically close to clock time.) +# system_delta = (system_usage - last_system_usage) / num_of_cpus +# +# quotient = cpu_delta / system_delta +# cpu_percent = round(quotient * 100 / 8, 1) +# process.system_usage = system_usage +# process.cpu_usage = cpu_usage +# # Computed percentage might be slightly above 100%. 
+# return process, min(cpu_percent, 100.0) + + +def profile(args): + + # print("cylc_profile:", os.environ['CYLC_PROFILE']) + max_rss = 0 + processes = [] + # last_system_usage = None + # last_cpu_usage = None + + # Find the correct memory_stat file for the process + if not Path.exists(Path(args.memory)): + FileNotFoundError("cgroups not found") + + try: + # Find memory.stat files + for job_id in os.listdir(args.memory): + if "ex" in job_id: + print("found process:", job_id) + processes.append(Process( + cgroup_memory_path=args.memory, + # cgroup_cpu_path=args.cpu, + # system_usage=0, + # cpu_usage=0, + job_id=job_id)) + except FileNotFoundError as e: + print(e) + exit("Is this being ran on Azure HPC?") + + # cpu_count = get_host_num_cpus(args.cpuset_path, processes) + for i in range(30): + # Write memory usage data + for process in processes: + # Only save Max RSS to disk if it is above the previous value + try: + rss = int(parse_memory_file(process)) + if rss > max_rss: + max_rss = rss + write_data(process, rss, + args.output_dir, ".memory") + + except (OSError, IOError) as error: + print(error) + + # process, usage_percent = get_cpu_percent( + # cpu_count, args.proc_path, + # process, last_system_usage, last_cpu_usage) + # + # write_data(process, usage_percent, + # args.output_dir, ".cpu") + + time.sleep(args.delay) + + +def parse_arguments(): + + p = ArgumentParser( + usage="%(prog)s [options]", + description="Profiler which periodically polls PBS cgroups to track " + "the resource usage of jobs running on the node.") + + p.add_argument("-i", dest="delay", type=int, metavar="S", + default=10, help="interval between query cycles in seconds") + p.add_argument("-o", dest="output_dir", type=str, + default=os.environ['DATADIR'], + help="output directory for json file") + p.add_argument("-m", dest="memory", type=str, + default="/sys/fs/cgroup/memory/pbspro.service/jobid", + # default="/sys/fs/cgroup", + help="Location of memory process files") + # 
p.add_argument("-c", dest="cpu", type=str, + # default="/sys/fs/cgroup/cpu,cpuacct/pbspro.service/jobid", + # # default="/sys/fs/cgroup", + # help="Location of cpu cgroup files") + # p.add_argument("-u", dest="cpuset_path", type=str, + # default="/sys/fs/cgroup/cpuset/pbspro.service/jobid", + # help="Location of processor details") + # p.add_argument("-p", dest="proc_path", type=str, + # default="/sys/fs/cgroup/cpuset/pbspro.service/jobid", + # help="Location of processor details") + + args = p.parse_args() + + return args diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py new file mode 100755 index 00000000000..392a66569c6 --- /dev/null +++ b/cylc/flow/scripts/profiler.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +"""cylc profiler [OPTIONS] + +Profiler which periodically polls PBS cgroups to track +the resource usage of jobs running on the node. 
+""" + +import os +import re +import sys +import time +import signal +import subprocess +from pathlib import Path +from dataclasses import dataclass +from cylc.flow.terminal import cli_function +from cylc.flow.option_parsers import CylcOptionParser as COP + +INTERNAL = True +PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") +RE_INT = re.compile(r'\d+') + + +def get_option_parser() -> COP: + parser = COP( + __doc__, + argdoc=[ + ], + ) + parser.add_option( + "-i", type=int, help="interval between query cycles in seconds", + default=10, dest="delay") + parser.add_option( + "-m", type=str, help="Location of cgroups directory", + default="/sys/fs/cgroup", + dest="cgroup_location") + + return parser + + +@cli_function(get_option_parser) +def main(parser, options): + """CLI main.""" + # Register the stop_profiler function with the signal library + signal.signal(signal.SIGINT, stop_profiler) + signal.signal(signal.SIGHUP, stop_profiler) + signal.signal(signal.SIGTERM, stop_profiler) + + profile(options) + + +@dataclass +class Process: + """Class for representing CPU and Memory usage of a process""" + cgroup_memory_path: str + cgroup_cpu_path: str + + +def stop_profiler(*args): + """This function will be executed when the SIGINT signal is sent + to this process""" + print('profiler exited') + sys.exit(0) + + +def parse_memory_file(process): + """Open the memory stat file and copy the appropriate data""" + + with open(process.cgroup_memory_path, 'r') as f: + for line in f: + return int(line) // 1024 + + +def parse_cpu_file(process, cgroup_version): + """Open the memory stat file and return the appropriate data""" + + if cgroup_version == 1: + with open(process.cgroup_cpu_path, 'r') as f: + for line in f: + if "usage_usec" in line: + return int(RE_INT.findall(line)[0]) // 1000 + elif cgroup_version == 2: + with open(process.cgroup_cpu_path, 'r') as f: + for line in f: + # Cgroups v2 uses nanoseconds + return int(line) / 1000000 + else: + raise FileNotFoundError("cpu usage files not 
found") + + +def write_data(data, filename): + try: + with open(filename, 'w') as f: + f.write(data + "\n") + except IOError as err: + raise IOError("Unable to write data to file:" + filename) from err + + +def get_cgroup_dir(): + """Get the cgroup directory for the current process""" + # Get the PID of the current process + pid = os.getpid() + # Get the cgroup information for the current process + result = subprocess.run(['cat', '/proc/' + str(pid) + '/cgroup'], + capture_output=True, text=True, shell=False) + if result.stderr: + print(result.stderr, file=sys.stderr) + result = PID_REGEX.search(result.stdout).group() + return result + + +def profile(args): + # Find the cgroup that this process is running in. + # Cylc will put this profiler in the same cgroup + # as the job it is profiling + cgroup_name = get_cgroup_dir() + + # HPC uses cgroups v2 and SPICE uses cgroups v1 + cgroup_version = None + + if Path.exists(Path(args.cgroup_location + cgroup_name)): + cgroup_version = 1 + elif Path.exists(Path(args.cgroup_location + "/memory" + cgroup_name)): + cgroup_version = 2 + else: + raise FileNotFoundError("cgroups not found:" + cgroup_name) + + peak_memory = 0 + processes = [] + + if cgroup_version == 1: + try: + processes.append(Process( + cgroup_memory_path=args.cgroup_location + + cgroup_name + "/" + "memory.peak", + cgroup_cpu_path=args.cgroup_location + + cgroup_name + "/" + "cpu.stat")) + except FileNotFoundError as err: + print(err) + raise FileNotFoundError("cgroups not found:" + + args.cgroup_location) from err + + elif cgroup_version == 2: + try: + processes.append(Process( + cgroup_memory_path=args.cgroup_location + "/memory" + + cgroup_name + "/memory.max_usage_in_bytes", + cgroup_cpu_path=args.cgroup_location + "/cpu" + + cgroup_name + "/cpuacct.usage")) + except FileNotFoundError as err: + print(err) + raise FileNotFoundError("cgroups not found:" + + args.cgroup_location) from err + + while True: + failures = 0 + # Write memory usage data + for process 
in processes: + # Only save Max RSS to disk if it is above the previous value + try: + memory = parse_memory_file(process) + if memory > peak_memory: + peak_memory = memory + write_data(str(peak_memory), "max_rss") + cpu_time = parse_cpu_file(process, cgroup_version) + write_data(str(cpu_time), "cpu_time") + + except (OSError, ValueError) as error: + failures += 1 + if failures > 5: + raise OSError("cgroup polling failure", error) from error + + time.sleep(args.delay) + + +if __name__ == "__main__": + + arg_parser = get_option_parser() + profile(arg_parser.parse_args()) diff --git a/setup.cfg b/setup.cfg index abdf10fd759..3ae8e48299d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -174,11 +174,13 @@ cylc.command = kill = cylc.flow.scripts.kill:main lint = cylc.flow.scripts.lint:main list = cylc.flow.scripts.list:main + profiler = cylc.flow.scripts.profile:main message = cylc.flow.scripts.message:main pause = cylc.flow.scripts.pause:main ping = cylc.flow.scripts.ping:main play = cylc.flow.scripts.play:main poll = cylc.flow.scripts.poll:main + profile = cylc.flow.scripts.profiler:main psutils = cylc.flow.scripts.psutil:main reinstall = cylc.flow.scripts.reinstall:main release = cylc.flow.scripts.release:main diff --git a/tests/functional/jobscript/02-profiler.t b/tests/functional/jobscript/02-profiler.t new file mode 100644 index 00000000000..50e016c5ea0 --- /dev/null +++ b/tests/functional/jobscript/02-profiler.t @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +#------------------------------------------------------------------------------- +# cylc profile test +. "$(dirname "$0")/test_header" +#------------------------------------------------------------------------------- +set_test_number 3 +#------------------------------------------------------------------------------- +install_workflow "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" +#------------------------------------------------------------------------------- +TEST_NAME="${TEST_NAME_BASE}-validate" +run_ok "${TEST_NAME}" cylc validate "${WORKFLOW_NAME}" + +if [[ -n "${PYTHONPATH:-}" ]]; then + export PYTHONPATH="${PWD}/lib:${PYTHONPATH}" +else + export PYTHONPATH="${PWD}/lib" +fi + +export PATH_TO_CYLC_BIN="/path/to/cylc/bin" +create_test_global_config ' +[platforms] + [[profile]] + activate = true +' +#------------------------------------------------------------------------------- +TEST_NAME="${TEST_NAME_BASE}-run" +workflow_run_ok "${TEST_NAME}" cylc play --reference-test --debug --no-detach "${WORKFLOW_NAME}" + +grep_ok 'MAXRSS' "${WORKFLOW_RUN_DIR}/log/scheduler/log" + +purge diff --git a/tests/functional/jobscript/02-profiler/bin/foo.sh b/tests/functional/jobscript/02-profiler/bin/foo.sh new file mode 100755 index 00000000000..4b20577c0d0 --- /dev/null +++ b/tests/functional/jobscript/02-profiler/bin/foo.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +echo "Hello from $0" diff --git a/tests/functional/jobscript/02-profiler/flow.cylc b/tests/functional/jobscript/02-profiler/flow.cylc new file mode 100644 index 00000000000..951a5b94d20 --- /dev/null +++ b/tests/functional/jobscript/02-profiler/flow.cylc @@ -0,0 +1,77 
@@ +[meta] + title = "job script torture test" + + description = """Any task job script may fail regardless of user runtime +settings if changes to cylc re-order the job script sections badly: e.g. +"cylc task started" must be called after the CYLC_ environment variables +are exported. Additionally, users may rely on the order of variable +definition in each environment and script section: e.g. workflow +bin directory must go in PATH before the task runtime environment is +defined because workflow bin commands could be used in variable assignment +expressions.""" + +[scheduling] + [[graph]] + R1 = "foo" +[runtime] + [[foo]] + platform = localhost + init-script = """ +echo "HELLO FROM INIT-SCRIPT" +# define a variable +export VAR_IS=is""" + pre-script = """ +echo "HELLO FROM PRE-SCRIPT" +# init-script must be done: +echo VAR_IS is $VAR_IS +# user environment must be done: +echo E_ONE is $E_ONE +echo E_TWO is $E_TWO +echo E_THR is $E_THR +echo E_FOU is $E_FOU +echo E_FIV is $E_FIV +# define a variable +export VAR_PreCS=precs""" + script = """ +echo "HELLO FROM SCRIPT" +# init-script must be done: +echo VAR_IS is $VAR_IS +# pre-script must be done: +echo VAR_PreCS is $VAR_PreCS +# environment must be done: +echo E_ONE is $E_ONE +echo E_TWO is $E_TWO +echo E_THR is $E_THR +echo E_FOU is $E_FOU +echo E_FIV is $E_FIV +# define a variable +export VAR_CS=var_cs""" + post-script = """ +echo "HELLO FROM POST-SCRIPT" +# init-script must be done: +echo VAR_IS is $VAR_IS +# pre-script must be done: +echo VAR_PreCS is $VAR_PreCS +# script must be done: +echo VAR_CS is $VAR_CS +# environment must be done: +echo E_ONE is $E_ONE +echo E_TWO is $E_TWO +echo E_THR is $E_THR +echo E_FOU is $E_FOU +echo E_FIV is $E_FIV +echo VAR_IS is $VAR_IS +echo VAR_PreCS is $VAR_PreCS +echo VAR_CS is $VAR_CS +# define a variable +export VAR_PostCS=postcs""" + [[[environment]]] + # path to cylc must be available: + E_ONE = $(( RANDOM % 10 )) + # init-script must be done: + E_TWO = $VAR_IS + # 
cylc-defined variables must be done: + E_THR = $CYLC_WORKFLOW_SHARE_DIR + E_FOU = $CYLC_TASK_NAME + # the workflow bin must be in $PATH already: + E_FIV = $( foo.sh ) diff --git a/tests/functional/jobscript/02-profiler/reference.log b/tests/functional/jobscript/02-profiler/reference.log new file mode 100644 index 00000000000..08fe5d5558a --- /dev/null +++ b/tests/functional/jobscript/02-profiler/reference.log @@ -0,0 +1,3 @@ +Initial point: 1 +Final point: 1 +1/foo -triggered off [] diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index 7e6b7982519..32bdcea2861 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -399,11 +399,16 @@ def test_write_task_environment(): 'CYLC_TASK_NAMESPACE_HIERARCHY="baa moo"\n export ' 'CYLC_TASK_TRY_NUMBER=1\n export ' 'CYLC_TASK_FLOW_NUMBERS=1\n export ' + 'CYLC_PROFILE=true\n export ' 'CYLC_TASK_PARAM_duck="quack"\n export ' 'CYLC_TASK_PARAM_mouse="squeak"\n ' 'CYLC_TASK_WORK_DIR_BASE=\'farm_noises/work_d\'\n}') job_conf = { - "platform": {'communication method': 'ssh'}, + "platform": {'communication method': 'ssh', + 'profile': { + "activate": "true", + } + }, "job_d": "1/moo/01", "namespace_hierarchy": ["baa", "moo"], "dependencies": ['moo', 'neigh', 'quack'], @@ -534,3 +539,4 @@ def test_homeless_platform(fixture_get_platform): job_sh_txt = job_sh.read() if 'HOME' in job_sh_txt: raise Exception('$HOME found in job.sh\n{job_sh_txt}') + From 9727be9e5df91600f6213ff14e9af4aa9ce5770e Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 24 Feb 2025 10:43:11 +0000 Subject: [PATCH 002/101] CPU/Memory Logging working Initial profiler implementation (non working) Changed the name of the profiler module. 
Linting Profiler sends KB instead of bytes Time Series now working CPU/Memory Logging working Adding profiler unit tests updating tests Fail gracefully if cgroups cannot be found Revert "Fail gracefully if cgroups cannot be found" This reverts commit 92e1e11c9b392b4742501d399f191f590814e95e. Linting Modifying unit tests Linting Changed the name of the profiler module. Profiler sends KB instead of bytes Time Series now working --- .github/workflows/1_create_release_pr.yml | 13 - .github/workflows/2_auto_publish_release.yml | 11 - .github/workflows/bash.yml | 8 - .github/workflows/build.yml | 34 +- .github/workflows/test_conda-build.yml | 7 - .github/workflows/test_fast.yml | 7 +- .github/workflows/test_functional.yml | 4 - .github/workflows/test_tutorial_workflow.yml | 8 - cylc/flow/cfgspec/globalcfg.py | 46 +- cylc/flow/etc/job.sh | 3 - cylc/flow/host_select.py | 11 +- cylc/flow/scripts/profile.py | 183 +++---- setup.cfg | 1 - .../43-auto-restart-force-override-normal.t | 5 +- tests/unit/scripts/test_profiler.py | 160 +++++++ tests/unit/test_subprocpool.py | 449 +++++++++--------- 16 files changed, 455 insertions(+), 495 deletions(-) create mode 100644 tests/unit/scripts/test_profiler.py diff --git a/.github/workflows/1_create_release_pr.yml b/.github/workflows/1_create_release_pr.yml index db8374e0b54..10a700b7ab4 100644 --- a/.github/workflows/1_create_release_pr.yml +++ b/.github/workflows/1_create_release_pr.yml @@ -11,19 +11,6 @@ on: required: false default: 'master' -concurrency: - # Only let this run 1 at a time - group: ${{ github.workflow }} - cancel-in-progress: false - -defaults: - run: - shell: bash - -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off - jobs: create-release-pr: runs-on: ubuntu-latest diff --git a/.github/workflows/2_auto_publish_release.yml b/.github/workflows/2_auto_publish_release.yml index a2b84a46c4d..943c54090d9 100644 --- a/.github/workflows/2_auto_publish_release.yml +++ b/.github/workflows/2_auto_publish_release.yml @@ -7,18 +7,7 @@ 
on: # NOTE: While this is too generic, we use the `if` condition of the job to narrow it down # NOTE: Don't use `branches` as we might create release on any branch -concurrency: - # Only let this run 1 at a time - group: ${{ github.workflow }} - cancel-in-progress: false - -defaults: - run: - shell: bash - env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off # Best not to include the GH token here, only do it for the steps that need it MERGE_SHA: ${{ github.event.pull_request.merge_commit_sha }} CHANGELOG_FILE: CHANGES.md diff --git a/.github/workflows/bash.yml b/.github/workflows/bash.yml index d0970de9778..b7c97cd21e9 100644 --- a/.github/workflows/bash.yml +++ b/.github/workflows/bash.yml @@ -31,14 +31,6 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -defaults: - run: - shell: bash - -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off - jobs: bash-docker: runs-on: ubuntu-latest diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6d12e5b1145..2f09c9cbf39 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,20 +10,6 @@ on: - 'MANIFEST.in' # check packaging - 'pyproject.toml' # check build config - 'setup.cfg' # check deps and project config - - '.gitignore' - - '.github/workflows/build.yml' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -defaults: - run: - shell: bash -leo pipefail {0} - -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off jobs: test: @@ -32,23 +18,21 @@ jobs: strategy: fail-fast: false matrix: - os: ['ubuntu-latest', 'macos-latest'] - python: ['3.7', '3.8', '3.9', '3.10', '3'] - exclude: - - os: 'macos-latest' + os: ['ubuntu-latest'] + python: ['3.8', '3.9', '3.10', '3.11'] + include: + - os: 'ubuntu-22.04' python: '3.7' + - os: 'macos-latest' + python: '3.8' steps: - name: Checkout uses: actions/checkout@v4 - name: Setup Python - uses: mamba-org/setup-micromamba@v2 + uses: actions/setup-python@v5 with: - 
cache-environment: true - post-cleanup: 'all' - environment-name: cylc-build - create-args: >- - python=${{ matrix.python }} + python-version: ${{ matrix.python }} - name: Build uses: cylc/release-actions/build-python-package@v1 @@ -56,7 +40,7 @@ jobs: - name: Inspect run: | unzip -l dist/*.whl | tee files - grep -E 'cylc_flow.*.dist-info/.*COPYING' files + grep 'cylc_flow.*.dist-info/COPYING' files grep 'cylc/flow/py.typed' files grep 'cylc/flow/etc' files grep 'cylc/flow/etc/cylc-completion.bash' files diff --git a/.github/workflows/test_conda-build.yml b/.github/workflows/test_conda-build.yml index 619981f2ac4..b4e97117b1a 100644 --- a/.github/workflows/test_conda-build.yml +++ b/.github/workflows/test_conda-build.yml @@ -13,13 +13,6 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -defaults: - run: - shell: bash - -env: - FORCE_COLOR: 2 - jobs: test_conda_install: if: github.repository_owner == 'cylc' || github.event_name != 'schedule' diff --git a/.github/workflows/test_fast.yml b/.github/workflows/test_fast.yml index 95e8a5754c9..9847b71a454 100644 --- a/.github/workflows/test_fast.yml +++ b/.github/workflows/test_fast.yml @@ -16,9 +16,6 @@ defaults: run: shell: bash -c "exec $CONDA_PREFIX/bin/bash -elo pipefail {0}" -env: - PIP_PROGRESS_BAR: off - jobs: test: runs-on: ${{ matrix.os }} @@ -35,9 +32,11 @@ jobs: - os: 'ubuntu-latest' python-version: '3.9' # not the oldest, not the most recent version time-zone: 'XXX-09:35' + env: TZ: ${{ matrix.time-zone }} PYTEST_ADDOPTS: --cov --cov-append -n 5 --color=yes + steps: - name: Checkout uses: actions/checkout@v4 @@ -107,8 +106,6 @@ jobs: strategy: matrix: python-version: ['3'] - env: - FORCE_COLOR: 2 steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/test_functional.yml b/.github/workflows/test_functional.yml index d85d3a60c9b..aa7ebb8de60 100644 --- a/.github/workflows/test_functional.yml +++ b/.github/workflows/test_functional.yml @@ -36,10 
+36,6 @@ defaults: run: shell: bash -c "exec $CONDA_PREFIX/bin/bash -elo pipefail {0}" -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off - jobs: test: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_tutorial_workflow.yml b/.github/workflows/test_tutorial_workflow.yml index 01808ca4871..3faa8469ef4 100644 --- a/.github/workflows/test_tutorial_workflow.yml +++ b/.github/workflows/test_tutorial_workflow.yml @@ -17,14 +17,6 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -defaults: - run: - shell: bash - -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off - jobs: test: strategy: diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 821b3fd3d1f..6d6dc23f717 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -826,52 +826,16 @@ def default_for( range. ''') Conf('condemned', VDR.V_ABSOLUTE_HOST_LIST, desc=f''' - List run hosts that workflows should *not* run on. + These hosts will not be used to run jobs. - These hosts will be subtracted from the - `available ` hosts - preventing new workflows from starting on the "condemned" host. - - Any workflows running on these hosts will either migrate - to another host, or shut down according to - :py:mod:`the configuration `. - - This feature requires ``auto restart`` to be listed - in `global.cylc[scheduler][main loop]plugins`. - - For more information, see the - :py:mod:`auto restart ` - plugin. - - .. rubric:: Example: - - .. code-block:: cylc - - [scheduler] - [[main loop]] - # activate the "auto restart" plugin - plugins = auto restart - [[run hosts]] - # there are three hosts in the "pool" - available = host1, host2, host3 - - # however two have been taken out: - # * workflows running on "host1" will attempt to - # restart on "host3" - # * workflows running on "host2" will shutdown - condemned = host1, host2! 
+ If workflows are already running on + condemned hosts, Cylc will shut them down and + restart them on different hosts. .. seealso:: - :py:mod:`cylc.flow.main_loop.auto_restart` :ref:`auto-stop-restart` - .. versionchanged:: 8.4.2 - - The "force mode" (activated by a "!" suffix) caused issues - at workflow startup for Cylc versions between 8.0.0 and - 8.4.1 inclusive. - .. versionchanged:: 8.0.0 {REPLACES}``[suite servers]condemned hosts``. @@ -1381,7 +1345,7 @@ def default_for( The means by which task progress messages are reported back to the running workflow. - .. rubric:: Options: + ..rubric:: Options: zmq Direct client-server TCP communication via network ports diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 39861aa1460..0d1e3275604 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -38,8 +38,6 @@ cylc__job__main() { set -x fi # Init-Script - cylc profile & - export profiler_pid="$!" & cylc__job__run_inst_func 'init_script' # Start error and vacation traps typeset signal_name= @@ -183,7 +181,6 @@ cylc__job__main() { wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>'/dev/null' || true cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" 'succeeded' || true - cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" 'max_rss:' "${MAXRSS}" || true # (Ignore shellcheck "globbing and word splitting" warning here). # shellcheck disable=SC2086 trap '' ${CYLC_VACATION_SIGNALS:-} ${CYLC_FAIL_SIGNALS} diff --git a/cylc/flow/host_select.py b/cylc/flow/host_select.py index cf940864b90..69e32c68a71 100644 --- a/cylc/flow/host_select.py +++ b/cylc/flow/host_select.py @@ -128,13 +128,6 @@ def select_workflow_host(cached=True): # be returned with the up-to-date configuration. global_config = glbl_cfg(cached=cached) - # condemned hosts may be suffixed with an "!" 
to activate "force mode" - blacklist = [] - for host in global_config.get(['scheduler', 'run hosts', 'condemned'], []): - if host.endswith('!'): - host = host[:-1] - blacklist.append(host) - return select_host( # list of workflow hosts global_config.get([ @@ -145,7 +138,9 @@ def select_workflow_host(cached=True): 'scheduler', 'run hosts', 'ranking' ]), # list of condemned hosts - blacklist=blacklist, + blacklist=global_config.get( + ['scheduler', 'run hosts', 'condemned'] + ), blacklist_name='condemned host' ) diff --git a/cylc/flow/scripts/profile.py b/cylc/flow/scripts/profile.py index 391dbbe9ca9..2d3f1faa0ce 100755 --- a/cylc/flow/scripts/profile.py +++ b/cylc/flow/scripts/profile.py @@ -21,16 +21,20 @@ """ import os +import re import sys import time import signal +import subprocess from pathlib import Path from dataclasses import dataclass -from argparse import ArgumentParser from cylc.flow.terminal import cli_function from cylc.flow.option_parsers import CylcOptionParser as COP INTERNAL = True +PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") + + def get_option_parser() -> COP: parser = COP( @@ -46,7 +50,8 @@ def get_option_parser() -> COP: default=os.environ['DATADIR'], dest="output_dir") parser.add_option( "-m", type=str, help="Location of memory process files", - default="/sys/fs/cgroup/memory/pbspro.service/jobid", dest="memory") + default="/sys/fs/cgroup", + dest="memory") return parser @@ -60,15 +65,12 @@ def main(parser, options): profile(options) - @dataclass class Process: """Class for representing CPU and Memory usage of a process""" cgroup_memory_path: str - # cgroup_cpu_path: str + cgroup_cpu_path: str job_id: str - # system_usage: int - # cpu_usage: int def stop_profiler(*args): @@ -80,128 +82,81 @@ def stop_profiler(*args): def parse_memory_file(process): """Open the memory stat file and copy the appropriate data""" - path = os.path.join(process.cgroup_memory_path + "/" + - process.job_id, "memory.stat") + memory_stats = {} - for line in open(path): - 
key, value = line.strip().split() - # Grab the data we want - if key == 'rss': - return value + for line in open(process.cgroup_memory_path): + return int(line) -def write_data(process, data, output_dir, data_type): +def parse_cpu_file(process): + """Open the memory stat file and copy the appropriate data""" + memory_stats = {} + + for line in open(process.cgroup_cpu_path): + if "usage_usec" in line: + return int(re.findall(r'\d+', line)[0]) + + +def write_data(process, data, output_dir, data_type, filename): # Build the output file path path = os.path.join(output_dir, process.job_id + data_type) try: - with open("/home/h01/cbennett/repos/cylc_ui_gantt/cylc-flow/max_rss", 'w') as f: - f.write(data) + with open(filename, 'w') as f: + f.write(data + "\n") except IOError: raise IOError("Unable to write memory data to file") -# def get_host_num_cpus(cpuset_path, processes): -# """Number of physical CPUs available to the process""" -# cpuset = open(cpuset_path + '/' + -# processes[0].job_id + '/cpuset.cpus').read() -# print("raw_data:", cpuset) -# cpu_number = cpuset.split('-') -# print('split:', cpu_number) -# number_of_cpus = ((int(cpu_number[1]) - int(cpu_number[0])) + 1) // 2 - # split_proc_stat_lines = [cpuset.split(') for line in proc_stat_lines] - # cpu_lines = [ - # split_line - # for split_line in split_proc_stat_lines - # if len(split_line) > 0 and "cpu" in split_line[0] - # ] - # # Number of lines starting with a word including 'cpu', subtracting - # # 1 for the first summary line. - # host_num_cpus = len(cpu_lines) - 1 - # print(number_of_cpus) - # return number_of_cpus - - -# def get_system_usage(): -# """ -# Computes total CPU usage of the host in nanoseconds. 
-# See also the / proc / stat entry here: -# https://man7.org/linux/man-pages/man5/proc.5.html -# """ -# # Grab usage data from proc/stat -# usage_data = open('/proc/stat').read().split("\n")[0].split()[1:8] -# -# total_clock_ticks = sum(int(entry) for entry in usage_data) -# # 100 clock ticks per second, 10^9 ns per second -# usage_ns = total_clock_ticks * 10 ** 7 -# return usage_ns -# -# -# def get_cpu_percent(num_of_cpus, proc_path, process, -# last_system_usage, last_cpu_usage): -# -# time.sleep(5) -# # Find cpuacct.usage files -# cpu_usage = int(open(process.cgroup_cpu_path + "/" + -# process.job_id + "/cpuacct.usage").read()) -# system_usage = get_system_usage() -# -# # Since deltas are not initially available, return 0.0 on first call. -# if last_system_usage is None: -# cpu_percent = 0.0 -# else: -# cpu_delta = cpu_usage - last_cpu_usage -# # "System time passed." (Typically close to clock time.) -# system_delta = (system_usage - last_system_usage) / num_of_cpus -# -# quotient = cpu_delta / system_delta -# cpu_percent = round(quotient * 100 / 8, 1) -# process.system_usage = system_usage -# process.cpu_usage = cpu_usage -# # Computed percentage might be slightly above 100%. 
-# return process, min(cpu_percent, 100.0) +def get_cgroup_dir(): + """Get the cgroup directory for the current process""" + # Get the PID of the current process + pid = os.getpid() + # Get the cgroup information for the current process + result = subprocess.run(['cat', '/proc/' + str(pid) + '/cgroup'], capture_output=True, text=True) + result = PID_REGEX.search(result.stdout).group() + return result def profile(args): - # print("cylc_profile:", os.environ['CYLC_PROFILE']) - max_rss = 0 + cgroup_name = get_cgroup_dir() + + # AZURE SPICE CGROUP LOCATION + cgroup_location = "/sys/fs/cgroup/" + cgroup_name + peak_memory = 0 processes = [] # last_system_usage = None # last_cpu_usage = None - # Find the correct memory_stat file for the process - if not Path.exists(Path(args.memory)): - FileNotFoundError("cgroups not found") - + if not Path.exists(Path(cgroup_location)): + raise FileNotFoundError("cgroups not found:" + cgroup_location) try: # Find memory.stat files - for job_id in os.listdir(args.memory): - if "ex" in job_id: - print("found process:", job_id) + for job_id in os.listdir(cgroup_location): + if "memory.peak" in job_id: processes.append(Process( - cgroup_memory_path=args.memory, - # cgroup_cpu_path=args.cpu, - # system_usage=0, - # cpu_usage=0, + cgroup_memory_path=cgroup_location + "/" + job_id, + cgroup_cpu_path=cgroup_location + "/" + "cpu.stat", job_id=job_id)) except FileNotFoundError as e: print(e) - exit("Is this being ran on Azure HPC?") + raise FileNotFoundError("cgroups not found:" + cgroup_location) # cpu_count = get_host_num_cpus(args.cpuset_path, processes) - for i in range(30): + while True: # Write memory usage data for process in processes: # Only save Max RSS to disk if it is above the previous value try: - rss = int(parse_memory_file(process)) - if rss > max_rss: - max_rss = rss - write_data(process, rss, - args.output_dir, ".memory") - - except (OSError, IOError) as error: + memory = parse_memory_file(process) + if memory > peak_memory: + 
peak_memory = memory + write_data(process, str(peak_memory), args.output_dir, ".memory", "max_rss") + cpu_time = parse_cpu_file(process) + write_data(process, str(cpu_time), args.output_dir, ".cpu", "cpu_time") + + except (OSError, IOError, ValueError) as error: print(error) # process, usage_percent = get_cpu_percent( @@ -214,33 +169,7 @@ def profile(args): time.sleep(args.delay) -def parse_arguments(): - - p = ArgumentParser( - usage="%(prog)s [options]", - description="Profiler which periodically polls PBS cgroups to track " - "the resource usage of jobs running on the node.") - - p.add_argument("-i", dest="delay", type=int, metavar="S", - default=10, help="interval between query cycles in seconds") - p.add_argument("-o", dest="output_dir", type=str, - default=os.environ['DATADIR'], - help="output directory for json file") - p.add_argument("-m", dest="memory", type=str, - default="/sys/fs/cgroup/memory/pbspro.service/jobid", - # default="/sys/fs/cgroup", - help="Location of memory process files") - # p.add_argument("-c", dest="cpu", type=str, - # default="/sys/fs/cgroup/cpu,cpuacct/pbspro.service/jobid", - # # default="/sys/fs/cgroup", - # help="Location of cpu cgroup files") - # p.add_argument("-u", dest="cpuset_path", type=str, - # default="/sys/fs/cgroup/cpuset/pbspro.service/jobid", - # help="Location of processor details") - # p.add_argument("-p", dest="proc_path", type=str, - # default="/sys/fs/cgroup/cpuset/pbspro.service/jobid", - # help="Location of processor details") - - args = p.parse_args() - - return args +if __name__ == "__main__": + + arg_parser = get_option_parser() + profile(arg_parser.parse_args()) diff --git a/setup.cfg b/setup.cfg index 3ae8e48299d..c172fdd3305 100644 --- a/setup.cfg +++ b/setup.cfg @@ -174,7 +174,6 @@ cylc.command = kill = cylc.flow.scripts.kill:main lint = cylc.flow.scripts.lint:main list = cylc.flow.scripts.list:main - profiler = cylc.flow.scripts.profile:main message = cylc.flow.scripts.message:main pause = 
cylc.flow.scripts.pause:main ping = cylc.flow.scripts.ping:main diff --git a/tests/functional/restart/43-auto-restart-force-override-normal.t b/tests/functional/restart/43-auto-restart-force-override-normal.t index 35edc57d1f9..b61d08c68cb 100644 --- a/tests/functional/restart/43-auto-restart-force-override-normal.t +++ b/tests/functional/restart/43-auto-restart-force-override-normal.t @@ -50,10 +50,7 @@ create_test_global_config '' " ${BASE_GLOBAL_CONFIG} [scheduler] [[run hosts]] - available = ${CYLC_TEST_HOST_1}, ${CYLC_TEST_HOST_2} - # ensure the workflow can start if a host is condemned - # in force mode see #6623 - condemned = ${CYLC_TEST_HOST_2}! + available = ${CYLC_TEST_HOST_1} " set_test_number 8 diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py new file mode 100644 index 00000000000..f8560eafd9a --- /dev/null +++ b/tests/unit/scripts/test_profiler.py @@ -0,0 +1,160 @@ +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +# Tests for functions contained in cylc.flow.scripts.profiler +from cylc.flow.scripts.profiler import (parse_memory_file, + parse_cpu_file, + write_data, + get_cgroup_name, + get_cgroup_version, + get_cgroup_paths, + stop_profiler, + profile) +import pytest +from unittest import mock + + +def test_parse_memory_file(mocker): + + with pytest.raises(FileNotFoundError): + parse_memory_file("non_existent_file.txt") + + # Mock the 'open' function call to return a file object. + mock_file = mocker.mock_open(read_data="1024") + mocker.patch("builtins.open", mock_file) + + # Test the parse_memory_file function + assert parse_memory_file("mocked_file.txt") == 1 + + # Assert that the 'open' function was called with the expected arguments. + mock_file.assert_called_once_with("mocked_file.txt", "r") + + +def test_parse_cpu_file(mocker): + with pytest.raises(FileNotFoundError): + parse_cpu_file("non_existent_file.txt", 1) + + # Mock the 'open' function call to return a file object. + mock_file = mocker.mock_open(read_data="usage_usec 1000000") + mocker.patch("builtins.open", mock_file) + + assert parse_cpu_file("mocked_file.txt", 1) == 1000 + mock_file.assert_called_once_with("mocked_file.txt", "r") + + mock_file = mocker.mock_open(read_data="1000000") + mocker.patch("builtins.open", mock_file) + assert parse_cpu_file("mocked_file.txt", 2) == 1 + mock_file.assert_called_once_with("mocked_file.txt", "r") + + +def test_write_data(tmpdir): + # Create tmp file + file = tmpdir.join('output.txt') + + write_data('test_data', file.strpath) + assert file.read() == 'test_data\n' + + +def test_get_cgroup_name(mocker): + + mock_file = mocker.mock_open(read_data="0::bad/test/cgroup/place") + mocker.patch("builtins.open", mock_file) + with pytest.raises(AttributeError): + get_cgroup_name() + + mock_file = mocker.mock_open(read_data="0::good/cgroup/place/2222222") + mocker.patch("builtins.open", mock_file) + assert get_cgroup_name() == "good/cgroup/place/2222222" + + +def 
test_get_cgroup_name_file_not_found(mocker): + + def mock_os_pid(): + return 'The Thing That Should Not Be' + + mocker.patch("os.getpid", mock_os_pid) + with pytest.raises(FileNotFoundError): + get_cgroup_name() + + +def test_get_cgroup_version(mocker): + + # Mock the Path.exists function call to return True + mocker.patch("pathlib.Path.exists", return_value=True) + assert get_cgroup_version('stuff/in/place', + 'more_stuff') == 1 + + with mock.patch('pathlib.Path.exists', side_effect=[False, True]): + assert get_cgroup_version('stuff/in/place', + 'more_stuff') == 2 + + # Mock the Path.exists function call to return False + mocker.patch("pathlib.Path.exists", return_value=False) + assert get_cgroup_version('stuff/in/other/place', + 'things') is None + + +def test_get_cgroup_paths(): + + process = get_cgroup_paths(1, "test_location/", + "test_name") + assert process.cgroup_memory_path == "test_location/test_name/memory.peak" + assert process.cgroup_cpu_path == "test_location/test_name/cpu.stat" + + process = get_cgroup_paths(2, "test_location", + "/test_name") + assert (process.cgroup_memory_path == + "test_location/memory/test_name/memory.max_usage_in_bytes") + assert (process.cgroup_cpu_path == + "test_location/cpu/test_name/cpuacct.usage") + + +def test_profile_cpu(mocker): + process = get_cgroup_paths(1, "test_location/", + "test_name") + + mock_file = mocker.mock_open(read_data="") + mocker.patch("builtins.open", mock_file) + mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", + return_value=0) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", + return_value=2048) + run_once = mock.Mock(side_effect=[True, False]) + profile(process, 1, 1, run_once) + mock_file.assert_called_with("cpu_time", "w") + + +def test_profile_max_rss(mocker): + process = get_cgroup_paths(1, + "test_location/", + "test_name") + + mock_file = mocker.mock_open(read_data="") + mocker.patch("builtins.open", mock_file) + 
mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", + return_value=1024) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", + return_value=2048) + run_once = mock.Mock(side_effect=[True, False]) + profile(process, 1, 1, run_once) + mock_file.assert_called_with("max_rss", "w") + + +def test_stop_profiler(): + with pytest.raises(SystemExit) as pytest_wrapped_e: + stop_profiler() + assert pytest_wrapped_e.type == SystemExit + assert pytest_wrapped_e.value.code == 0 diff --git a/tests/unit/test_subprocpool.py b/tests/unit/test_subprocpool.py index b4fe684a5e9..6b686c27082 100644 --- a/tests/unit/test_subprocpool.py +++ b/tests/unit/test_subprocpool.py @@ -14,17 +14,16 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -from pathlib import Path -from types import SimpleNamespace from tempfile import ( - NamedTemporaryFile, - SpooledTemporaryFile, - TemporaryFile, - TemporaryDirectory, + NamedTemporaryFile, SpooledTemporaryFile, TemporaryFile, + TemporaryDirectory ) - +import unittest import pytest +from pathlib import Path +from types import SimpleNamespace + from cylc.flow import LOG from cylc.flow.id import Tokens from cylc.flow.cycling.iso8601 import ISO8601Point @@ -45,201 +44,188 @@ from cylc.flow.task_proxy import TaskProxy -def test_get_temporary_file(): - """Test SubProcPool.get_temporary_file.""" - assert isinstance(SubProcPool.get_temporary_file(), SpooledTemporaryFile) - - -def test_run_command_returns_0(): - """Test basic usage, command returns 0""" - ctx = SubProcContext('truth', ['true']) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == '' - assert ctx.ret_code == 0 - - -def test_run_command_returns_1(): - """Test basic usage, command returns 1""" - ctx = SubProcContext('lies', ['false']) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == '' - assert ctx.ret_code == 1 - - -def test_run_command_writes_to_out(): - """Test basic 
usage, command writes to STDOUT""" - ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr']) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'pirate urrrr\n' - assert ctx.ret_code == 0 - - -def test_run_command_writes_to_err(): - """Test basic usage, command writes to STDERR""" - ctx = SubProcContext( - 'parrot2', - ['bash', '--noprofile', '--norc', '-c', 'echo pirate errrr >&2'] - ) - SubProcPool.run_command(ctx) - assert 'pirate errrr\n' - assert ctx.out == '' - assert ctx.ret_code == 0 - - -def test_run_command_with_stdin_from_str(): - """Test STDIN from string""" - ctx = SubProcContext('meow', ['cat'], stdin_str='catches mice.\n') - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\n' - assert ctx.ret_code == 0 - - -def test_run_command_with_stdin_from_unicode(): - """Test STDIN from string with Unicode""" - ctx = SubProcContext('meow', ['cat'], stdin_str='喵\n') - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == '喵\n' - assert ctx.ret_code == 0 - - -def test_run_command_with_stdin_from_handle(): - """Test STDIN from a single opened file handle""" - handle = TemporaryFile() - handle.write('catches mice.\n'.encode('UTF-8')) - handle.seek(0) - ctx = SubProcContext('meow', ['cat'], stdin_files=[handle]) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\n' - assert ctx.ret_code == 0 - handle.close() - - -def test_run_command_with_stdin_from_path(): - """Test STDIN from a single file path""" - handle = NamedTemporaryFile() - handle.write('catches mice.\n'.encode('UTF-8')) - handle.seek(0) - ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name]) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\n' - assert ctx.ret_code == 0 - handle.close() - - -def test_run_command_with_stdin_from_handles(): - """Test STDIN from multiple file handles""" - handles = [] - for txt in ['catches mice.\n', 'eat 
fish.\n']: +class TestSubProcPool(unittest.TestCase): + + def test_get_temporary_file(self): + """Test SubProcPool.get_temporary_file.""" + self.assertIsInstance( + SubProcPool.get_temporary_file(), SpooledTemporaryFile) + + def test_run_command_returns_0(self): + """Test basic usage, command returns 0""" + ctx = SubProcContext('truth', ['true']) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, '') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_returns_1(self): + """Test basic usage, command returns 1""" + ctx = SubProcContext('lies', ['false']) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, '') + self.assertEqual(ctx.ret_code, 1) + + def test_run_command_writes_to_out(self): + """Test basic usage, command writes to STDOUT""" + ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr']) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'pirate urrrr\n') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_writes_to_err(self): + """Test basic usage, command writes to STDERR""" + ctx = SubProcContext( + 'parrot2', ['bash', '-c', 'echo pirate errrr >&2']) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, 'pirate errrr\n') + self.assertEqual(ctx.out, '') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_with_stdin_from_str(self): + """Test STDIN from string""" + ctx = SubProcContext('meow', ['cat'], stdin_str='catches mice.\n') + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\n') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_with_stdin_from_unicode(self): + """Test STDIN from string with Unicode""" + ctx = SubProcContext('meow', ['cat'], stdin_str='喵\n') + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, '喵\n') + self.assertEqual(ctx.ret_code, 0) + + def 
test_run_command_with_stdin_from_handle(self): + """Test STDIN from a single opened file handle""" handle = TemporaryFile() - handle.write(txt.encode('UTF-8')) + handle.write('catches mice.\n'.encode('UTF-8')) handle.seek(0) - handles.append(handle) - ctx = SubProcContext('meow', ['cat'], stdin_files=handles) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\neat fish.\n' - assert ctx.ret_code == 0 - for handle in handles: + ctx = SubProcContext('meow', ['cat'], stdin_files=[handle]) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\n') + self.assertEqual(ctx.ret_code, 0) handle.close() - -def test_run_command_with_stdin_from_paths(): - """Test STDIN from multiple file paths""" - handles = [] - for txt in ['catches mice.\n', 'eat fish.\n']: + def test_run_command_with_stdin_from_path(self): + """Test STDIN from a single file path""" handle = NamedTemporaryFile() - handle.write(txt.encode('UTF-8')) + handle.write('catches mice.\n'.encode('UTF-8')) handle.seek(0) - handles.append(handle) - ctx = SubProcContext( - 'meow', ['cat'], stdin_files=[handle.name for handle in handles] - ) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\neat fish.\n' - assert ctx.ret_code == 0 - for handle in handles: + ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name]) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\n') + self.assertEqual(ctx.ret_code, 0) handle.close() - -def test_xfunction(): - """Test xtrigger function import.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - the_answer_file = python_dir / "the_answer.py" - with the_answer_file.open(mode="w") as f: - f.write("""the_answer = lambda: 42""") - f.flush() - f_name = "the_answer" - fn = get_xtrig_func(f_name, f_name, temp_dir) - result = fn() - 
assert 42 == result - - -def test_xfunction_cache(): - """Test xtrigger function import cache.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - amandita_file = python_dir / "amandita.py" - with amandita_file.open(mode="w") as f: - f.write("""choco = lambda: 'chocolate'""") - f.flush() - m_name = "amandita" # module - f_name = "choco" # function - fn = get_xtrig_func(m_name, f_name, temp_dir) - result = fn() - assert 'chocolate' == result - - # is in the cache - assert (m_name, f_name) in _XTRIG_FUNC_CACHE - # returned from cache - assert fn, get_xtrig_func(m_name, f_name == temp_dir) - - -def test_xfunction_import_error(): - """Test for error on importing a xtrigger function. - - To prevent the test eventually failing if the test function is added - and successfully imported, we use an invalid module name as per Python - spec. - """ - with TemporaryDirectory() as temp_dir, pytest.raises(ModuleNotFoundError): - get_xtrig_func("invalid-module-name", "func-name", temp_dir) - - -def test_xfunction_attribute_error(): - """Test for error on looking for an attribute in a xtrigger script.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - the_answer_file = python_dir / "the_sword.py" - with the_answer_file.open(mode="w") as f: - f.write("""the_droid = lambda: 'excalibur'""") - f.flush() - f_name = "the_sword" - with pytest.raises(AttributeError): - get_xtrig_func(f_name, f_name, temp_dir) + def test_run_command_with_stdin_from_handles(self): + """Test STDIN from multiple file handles""" + handles = [] + for txt in ['catches mice.\n', 'eat fish.\n']: + handle = TemporaryFile() + handle.write(txt.encode('UTF-8')) + handle.seek(0) + handles.append(handle) + ctx = SubProcContext('meow', ['cat'], stdin_files=handles) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\neat 
fish.\n') + self.assertEqual(ctx.ret_code, 0) + for handle in handles: + handle.close() + + def test_run_command_with_stdin_from_paths(self): + """Test STDIN from multiple file paths""" + handles = [] + for txt in ['catches mice.\n', 'eat fish.\n']: + handle = NamedTemporaryFile() + handle.write(txt.encode('UTF-8')) + handle.seek(0) + handles.append(handle) + ctx = SubProcContext( + 'meow', ['cat'], stdin_files=[handle.name for handle in handles]) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\neat fish.\n') + self.assertEqual(ctx.ret_code, 0) + for handle in handles: + handle.close() + + def test_xfunction(self): + """Test xtrigger function import.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + the_answer_file = python_dir / "the_answer.py" + with the_answer_file.open(mode="w") as f: + f.write("""the_answer = lambda: 42""") + f.flush() + f_name = "the_answer" + fn = get_xtrig_func(f_name, f_name, temp_dir) + result = fn() + self.assertEqual(42, result) + + def test_xfunction_cache(self): + """Test xtrigger function import cache.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + amandita_file = python_dir / "amandita.py" + with amandita_file.open(mode="w") as f: + f.write("""choco = lambda: 'chocolate'""") + f.flush() + m_name = "amandita" # module + f_name = "choco" # function + fn = get_xtrig_func(m_name, f_name, temp_dir) + result = fn() + self.assertEqual('chocolate', result) + + # is in the cache + self.assertTrue((m_name, f_name) in _XTRIG_FUNC_CACHE) + # returned from cache + self.assertEqual(fn, get_xtrig_func(m_name, f_name, temp_dir)) + + def test_xfunction_import_error(self): + """Test for error on importing a xtrigger function. 
+ + To prevent the test eventually failing if the test function is added + and successfully imported, we use an invalid module name as per Python + spec. + """ + with TemporaryDirectory() as temp_dir, self.assertRaises( + ModuleNotFoundError + ): + get_xtrig_func("invalid-module-name", "func-name", temp_dir) + + def test_xfunction_attribute_error(self): + """Test for error on looking for an attribute in a xtrigger script.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + the_answer_file = python_dir / "the_sword.py" + with the_answer_file.open(mode="w") as f: + f.write("""the_droid = lambda: 'excalibur'""") + f.flush() + f_name = "the_sword" + with self.assertRaises(AttributeError): + get_xtrig_func(f_name, f_name, temp_dir) @pytest.fixture def mock_ctx(): def inner_(ret_code=None, host=None, cmd_key=None, cmd=None): - """Provide a SimpleNamespace which looks like a ctx object.""" + """Provide a SimpleNamespace which looks like a ctx object. 
+ """ inputs = locals() defaults = { - 'ret_code': 255, - 'host': 'mouse', - 'cmd_key': 'my-command', - 'cmd': ['bistromathic', 'take-off'], + 'ret_code': 255, 'host': 'mouse', 'cmd_key': 'my-command', + 'cmd': ['bistromathic', 'take-off'] } for key in inputs: if inputs[key] is None: @@ -249,10 +235,9 @@ def inner_(ret_code=None, host=None, cmd_key=None, cmd=None): timestamp=None, ret_code=inputs['ret_code'], host=inputs['host'], - cmd_key=inputs['cmd_key'], + cmd_key=inputs['cmd_key'] ) return ctx - yield inner_ @@ -275,18 +260,21 @@ def _test_callback_255(ctx, foo=''): 'platform: None - Could not connect to mouse.', 255, 'ssh', - id="return 255", + id="return 255" ), pytest.param( 'platform: localhost - Could not connect to mouse.', 255, TaskJobLogsRetrieveContext(['ssh', 'something'], None, None), - id="return 255 (log-ret)", - ), - ], + id="return 255 (log-ret)" + ) + ] ) -def test__run_command_exit(caplog, mock_ctx, expect, ret_code, cmd_key): - """It runs a callback""" +def test__run_command_exit( + caplog, mock_ctx, expect, ret_code, cmd_key +): + """It runs a callback + """ ctx = mock_ctx(ret_code=ret_code, cmd_key=cmd_key, cmd=['ssh']) SubProcPool._run_command_exit( ctx, callback=_test_callback, callback_255=_test_callback_255 @@ -305,7 +293,9 @@ def test__run_command_exit_no_255_callback(caplog, mock_ctx): def test__run_command_exit_no_gettable_platform(caplog, mock_ctx): """It logs being unable to select a platform""" ret_ctx = TaskJobLogsRetrieveContext( - platform_name='rhenas', max_size=256, key='rhenas' + platform_name='rhenas', + max_size=256, + key='rhenas' ) ctx = mock_ctx(cmd_key=ret_ctx, cmd=['ssh'], ret_code=255) SubProcPool._run_command_exit(ctx, callback=_test_callback) @@ -320,19 +310,20 @@ def test__run_command_exit_no_255_args(caplog, mock_ctx): mock_ctx(cmd=['ssh', 'Zaphod']), callback=_test_callback, callback_args=['Zaphod'], - callback_255=_test_callback_255, + callback_255=_test_callback_255 ) assert '255' in caplog.records[1].msg def 
test__run_command_exit_add_to_badhosts(mock_ctx): - """It updates the list of badhosts""" + """It updates the list of badhosts + """ badhosts = {'foo', 'bar'} SubProcPool._run_command_exit( mock_ctx(cmd=['ssh']), bad_hosts=badhosts, callback=print, - callback_args=['Welcome to Magrathea'], + callback_args=['Welcome to Magrathea'] ) assert badhosts == {'foo', 'bar', 'mouse'} @@ -344,36 +335,32 @@ def test__run_command_exit_add_to_badhosts_log(caplog, mock_ctx): mock_ctx(cmd=['ssh']), bad_hosts=badhosts, callback=lambda x, t: print(str(x)), - callback_args=[ - TaskProxy( - Tokens('~u/w//c/t/2'), - SimpleNamespace( - name='t', - dependencies={}, - sequential='', - external_triggers=[], - xtrig_labels={}, - expiration_offset=None, - outputs={ - TASK_OUTPUT_SUBMITTED: [None, None], - TASK_OUTPUT_SUBMIT_FAILED: [None, None], - TASK_OUTPUT_SUCCEEDED: [None, None], - TASK_OUTPUT_FAILED: [None, None], - TASK_OUTPUT_EXPIRED: [None, None], - }, - graph_children={}, - rtconfig={'platform': 'foo'}, - ), - ISO8601Point('1990'), - ) - ], + callback_args=[TaskProxy( + Tokens('~u/w//c/t/2'), + SimpleNamespace( + name='t', dependencies={}, sequential='', + external_triggers=[], xtrig_labels={}, + expiration_offset=None, + outputs={ + TASK_OUTPUT_SUBMITTED: [None, None], + TASK_OUTPUT_SUBMIT_FAILED: [None, None], + TASK_OUTPUT_SUCCEEDED: [None, None], + TASK_OUTPUT_FAILED: [None, None], + TASK_OUTPUT_EXPIRED: [None, None], + }, + graph_children={}, rtconfig={'platform': 'foo'} + + ), + ISO8601Point('1990') + )] ) assert 'platform: foo' in caplog.records[0].message assert badhosts == {'foo', 'bar', 'mouse'} def test__run_command_exit_rsync_fails(mock_ctx): - """It updates the list of badhosts""" + """It updates the list of badhosts + """ badhosts = {'foo', 'bar'} ctx = mock_ctx(cmd=['rsync'], ret_code=42, cmd_key='file-install') SubProcPool._run_command_exit( @@ -384,10 +371,10 @@ def test__run_command_exit_rsync_fails(mock_ctx): { 'name': 'Magrathea', 'ssh command': 'ssh', - 'rsync 
command': 'rsync command', + 'rsync command': 'rsync command' }, 'Welcome to Magrathea', - ], + ] ) assert badhosts == {'foo', 'bar', 'mouse'} @@ -397,11 +384,12 @@ def test__run_command_exit_rsync_fails(mock_ctx): [ (True, {'cmd': ['ssh'], 'ret_code': 255}), (False, {'cmd': ['foo'], 'ret_code': 255}), - (False, {'cmd': ['ssh'], 'ret_code': 42}), - ], + (False, {'cmd': ['ssh'], 'ret_code': 42}) + ] ) def test_ssh_255_fail(mock_ctx, expect, ctx_kwargs): - """It knows when a ctx has failed""" + """It knows when a ctx has failed + """ output = SubProcPool.ssh_255_fail(mock_ctx(**ctx_kwargs)) assert output == expect @@ -413,10 +401,11 @@ def test_ssh_255_fail(mock_ctx, expect, ctx_kwargs): (True, {'cmd': ['rsync'], 'ret_code': 255, 'host': 'not_local'}), (False, {'cmd': ['make it-so'], 'ret_code': 255, 'host': 'not_local'}), (False, {'cmd': ['rsync'], 'ret_code': 125, 'host': 'localhost'}), - ], + ] ) def test_rsync_255_fail(mock_ctx, expect, ctx_kwargs): - """It knows when a ctx has failed""" + """It knows when a ctx has failed + """ output = SubProcPool.rsync_255_fail( mock_ctx(**ctx_kwargs), {'ssh command': 'ssh', 'rsync command': 'rsync command'}, From 171f7ee1d7126295f72f5563468f920d24aa3861 Mon Sep 17 00:00:00 2001 From: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Date: Wed, 12 Mar 2025 15:34:32 +0000 Subject: [PATCH 003/101] GH Actions: use explicit `bash` shell & other defaults --- .github/workflows/1_create_release_pr.yml | 13 ++++++++ .github/workflows/2_auto_publish_release.yml | 11 +++++++ .github/workflows/bash.yml | 8 +++++ .github/workflows/build.yml | 32 +++++++++++++++----- .github/workflows/test_conda-build.yml | 7 +++++ .github/workflows/test_fast.yml | 7 +++-- .github/workflows/test_functional.yml | 4 +++ .github/workflows/test_tutorial_workflow.yml | 8 +++++ 8 files changed, 80 insertions(+), 10 deletions(-) diff --git a/.github/workflows/1_create_release_pr.yml b/.github/workflows/1_create_release_pr.yml index 
10a700b7ab4..db8374e0b54 100644 --- a/.github/workflows/1_create_release_pr.yml +++ b/.github/workflows/1_create_release_pr.yml @@ -11,6 +11,19 @@ on: required: false default: 'master' +concurrency: + # Only let this run 1 at a time + group: ${{ github.workflow }} + cancel-in-progress: false + +defaults: + run: + shell: bash + +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off + jobs: create-release-pr: runs-on: ubuntu-latest diff --git a/.github/workflows/2_auto_publish_release.yml b/.github/workflows/2_auto_publish_release.yml index 943c54090d9..a2b84a46c4d 100644 --- a/.github/workflows/2_auto_publish_release.yml +++ b/.github/workflows/2_auto_publish_release.yml @@ -7,7 +7,18 @@ on: # NOTE: While this is too generic, we use the `if` condition of the job to narrow it down # NOTE: Don't use `branches` as we might create release on any branch +concurrency: + # Only let this run 1 at a time + group: ${{ github.workflow }} + cancel-in-progress: false + +defaults: + run: + shell: bash + env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off # Best not to include the GH token here, only do it for the steps that need it MERGE_SHA: ${{ github.event.pull_request.merge_commit_sha }} CHANGELOG_FILE: CHANGES.md diff --git a/.github/workflows/bash.yml b/.github/workflows/bash.yml index b7c97cd21e9..d0970de9778 100644 --- a/.github/workflows/bash.yml +++ b/.github/workflows/bash.yml @@ -31,6 +31,14 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +defaults: + run: + shell: bash + +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off + jobs: bash-docker: runs-on: ubuntu-latest diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2f09c9cbf39..dd925b8a64f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,6 +10,20 @@ on: - 'MANIFEST.in' # check packaging - 'pyproject.toml' # check build config - 'setup.cfg' # check deps and project config + - '.gitignore' + - '.github/workflows/build.yml' + +concurrency: 
+ group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash -leo pipefail {0} + +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off jobs: test: @@ -18,21 +32,23 @@ jobs: strategy: fail-fast: false matrix: - os: ['ubuntu-latest'] - python: ['3.8', '3.9', '3.10', '3.11'] - include: - - os: 'ubuntu-22.04' - python: '3.7' + os: ['ubuntu-latest', 'macos-latest'] + python: ['3.7', '3.8', '3.9', '3.10', '3'] + exclude: - os: 'macos-latest' - python: '3.8' + python: '3.7' steps: - name: Checkout uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v5 + uses: mamba-org/setup-micromamba@v2 with: - python-version: ${{ matrix.python }} + cache-environment: true + post-cleanup: 'all' + environment-name: cylc-build + create-args: >- + python=${{ matrix.python }} - name: Build uses: cylc/release-actions/build-python-package@v1 diff --git a/.github/workflows/test_conda-build.yml b/.github/workflows/test_conda-build.yml index b4e97117b1a..619981f2ac4 100644 --- a/.github/workflows/test_conda-build.yml +++ b/.github/workflows/test_conda-build.yml @@ -13,6 +13,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +defaults: + run: + shell: bash + +env: + FORCE_COLOR: 2 + jobs: test_conda_install: if: github.repository_owner == 'cylc' || github.event_name != 'schedule' diff --git a/.github/workflows/test_fast.yml b/.github/workflows/test_fast.yml index 9847b71a454..95e8a5754c9 100644 --- a/.github/workflows/test_fast.yml +++ b/.github/workflows/test_fast.yml @@ -16,6 +16,9 @@ defaults: run: shell: bash -c "exec $CONDA_PREFIX/bin/bash -elo pipefail {0}" +env: + PIP_PROGRESS_BAR: off + jobs: test: runs-on: ${{ matrix.os }} @@ -32,11 +35,9 @@ jobs: - os: 'ubuntu-latest' python-version: '3.9' # not the oldest, not the most recent version time-zone: 'XXX-09:35' - env: TZ: ${{ matrix.time-zone }} PYTEST_ADDOPTS: --cov --cov-append -n 5 --color=yes - steps: - name: Checkout uses: 
actions/checkout@v4 @@ -106,6 +107,8 @@ jobs: strategy: matrix: python-version: ['3'] + env: + FORCE_COLOR: 2 steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/test_functional.yml b/.github/workflows/test_functional.yml index aa7ebb8de60..d85d3a60c9b 100644 --- a/.github/workflows/test_functional.yml +++ b/.github/workflows/test_functional.yml @@ -36,6 +36,10 @@ defaults: run: shell: bash -c "exec $CONDA_PREFIX/bin/bash -elo pipefail {0}" +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off + jobs: test: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_tutorial_workflow.yml b/.github/workflows/test_tutorial_workflow.yml index 3faa8469ef4..01808ca4871 100644 --- a/.github/workflows/test_tutorial_workflow.yml +++ b/.github/workflows/test_tutorial_workflow.yml @@ -17,6 +17,14 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +defaults: + run: + shell: bash + +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off + jobs: test: strategy: From 7d50d0b48de116454da2d6edce194a2049eac8e3 Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Wed, 19 Feb 2025 15:41:45 +0000 Subject: [PATCH 004/101] host-select: fix compatibility with force-condemned hosts --- cylc/flow/cfgspec/globalcfg.py | 46 +++++++++++++++++-- cylc/flow/host_select.py | 11 +++-- .../43-auto-restart-force-override-normal.t | 5 +- 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 6d6dc23f717..499cb29b09e 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -826,16 +826,52 @@ def default_for( range. ''') Conf('condemned', VDR.V_ABSOLUTE_HOST_LIST, desc=f''' - These hosts will not be used to run jobs. + List run hosts that workflows should *not* run on. - If workflows are already running on - condemned hosts, Cylc will shut them down and - restart them on different hosts. 
+ These hosts will be subtracted from the + `available ` hosts + preventing new workflows from starting on the "condemned" host. + + Any workflows running on these hosts will either migrate + to another host, or shutdown according to + :py:mod:`the configuration `. + + This feature requires ``auto restart`` to be listed + in `global.cylc[scheduler][main loop]plugins`. + + For more information, see the + :py:mod:`auto restart ` + plugin. + + .. rubric:: Example: + + .. code-block:: cylc + + [scheduler] + [[main loop]] + # activate the "auto restart" plugin + plugins = auto restart + [[run hosts]] + # there are three hosts in the "pool" + available = host1, host2, host3 + + # however two have been taken out: + # * workflows running on "host1" will attempt to + # restart on "host3" + # * workflows running on "host2" will shutdown + condemned = host1, host2! .. seealso:: + :py:mod:`cylc.flow.main_loop.auto_restart` :ref:`auto-stop-restart` + .. versionchanged:: 8.4.2 + + The "force mode" (activated by a "!" suffix) caused issues + at workflow startup for Cylc versions between 8.0.0 and + 8.4.1 inclusive. + .. versionchanged:: 8.0.0 {REPLACES}``[suite servers]condemned hosts``. @@ -1345,7 +1381,7 @@ def default_for( The means by which task progress messages are reported back to the running workflow. - ..rubric:: Options: + .. rubric:: Options: zmq Direct client-server TCP communication via network ports diff --git a/cylc/flow/host_select.py b/cylc/flow/host_select.py index 69e32c68a71..cf940864b90 100644 --- a/cylc/flow/host_select.py +++ b/cylc/flow/host_select.py @@ -128,6 +128,13 @@ def select_workflow_host(cached=True): # be returned with the up-to-date configuration. global_config = glbl_cfg(cached=cached) + # condemned hosts may be suffixed with an "!" 
to activate "force mode" + blacklist = [] + for host in global_config.get(['scheduler', 'run hosts', 'condemned'], []): + if host.endswith('!'): + host = host[:-1] + blacklist.append(host) + return select_host( # list of workflow hosts global_config.get([ @@ -138,9 +145,7 @@ def select_workflow_host(cached=True): 'scheduler', 'run hosts', 'ranking' ]), # list of condemned hosts - blacklist=global_config.get( - ['scheduler', 'run hosts', 'condemned'] - ), + blacklist=blacklist, blacklist_name='condemned host' ) diff --git a/tests/functional/restart/43-auto-restart-force-override-normal.t b/tests/functional/restart/43-auto-restart-force-override-normal.t index b61d08c68cb..35edc57d1f9 100644 --- a/tests/functional/restart/43-auto-restart-force-override-normal.t +++ b/tests/functional/restart/43-auto-restart-force-override-normal.t @@ -50,7 +50,10 @@ create_test_global_config '' " ${BASE_GLOBAL_CONFIG} [scheduler] [[run hosts]] - available = ${CYLC_TEST_HOST_1} + available = ${CYLC_TEST_HOST_1}, ${CYLC_TEST_HOST_2} + # ensure the workflow can start if a host is condemned + # in force mode see #6623 + condemned = ${CYLC_TEST_HOST_2}! " set_test_number 8 From 86080886ec990441604634c0e17e395ebcdac8de Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Tue, 18 Mar 2025 11:31:13 +0000 Subject: [PATCH 005/101] Update cylc/flow/cfgspec/globalcfg.py Co-authored-by: Hilary James Oliver --- cylc/flow/cfgspec/globalcfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 499cb29b09e..821b3fd3d1f 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -833,7 +833,7 @@ def default_for( preventing new workflows from starting on the "condemned" host. Any workflows running on these hosts will either migrate - to another host, or shutdown according to + to another host, or shut down according to :py:mod:`the configuration `. 
This feature requires ``auto restart`` to be listed From 060056ba79d667b5f4aaac5c4ed919fbcfc5b351 Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Fri, 21 Mar 2025 10:27:34 +0000 Subject: [PATCH 006/101] actions: update build action to support python 3.13 * The COPYING file appears to have moved from `dist-info/COPYING` into `dist-info/licenses/COPYING`. --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index dd925b8a64f..6d12e5b1145 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -56,7 +56,7 @@ jobs: - name: Inspect run: | unzip -l dist/*.whl | tee files - grep 'cylc_flow.*.dist-info/COPYING' files + grep -E 'cylc_flow.*.dist-info/.*COPYING' files grep 'cylc/flow/py.typed' files grep 'cylc/flow/etc' files grep 'cylc/flow/etc/cylc-completion.bash' files From a0cdcd65d15e82637cf2821b48f6ff2e2826134a Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Wed, 19 Feb 2025 15:41:45 +0000 Subject: [PATCH 007/101] host-select: fix compatibility with force-condemned hosts --- cylc/flow/cfgspec/globalcfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 821b3fd3d1f..499cb29b09e 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -833,7 +833,7 @@ def default_for( preventing new workflows from starting on the "condemned" host. Any workflows running on these hosts will either migrate - to another host, or shut down according to + to another host, or shutdown according to :py:mod:`the configuration `. 
This feature requires ``auto restart`` to be listed From 3d4a187955d80d7ced97c579971ce84809225d22 Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Tue, 18 Mar 2025 11:31:13 +0000 Subject: [PATCH 008/101] Update cylc/flow/cfgspec/globalcfg.py Co-authored-by: Hilary James Oliver --- cylc/flow/cfgspec/globalcfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 499cb29b09e..821b3fd3d1f 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -833,7 +833,7 @@ def default_for( preventing new workflows from starting on the "condemned" host. Any workflows running on these hosts will either migrate - to another host, or shutdown according to + to another host, or shut down according to :py:mod:`the configuration `. This feature requires ``auto restart`` to be listed From 3057eddfe9f121948d490d46add61bd3bf28f917 Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Thu, 13 Mar 2025 09:44:47 +0000 Subject: [PATCH 009/101] tests: convert unittest to pytest --- tests/unit/test_subprocpool.py | 436 +++++++++++++++++---------------- 1 file changed, 223 insertions(+), 213 deletions(-) diff --git a/tests/unit/test_subprocpool.py b/tests/unit/test_subprocpool.py index 6b686c27082..2b5ea409d68 100644 --- a/tests/unit/test_subprocpool.py +++ b/tests/unit/test_subprocpool.py @@ -15,10 +15,11 @@ # along with this program. If not, see . 
from tempfile import ( - NamedTemporaryFile, SpooledTemporaryFile, TemporaryFile, - TemporaryDirectory + NamedTemporaryFile, + SpooledTemporaryFile, + TemporaryFile, + TemporaryDirectory, ) -import unittest import pytest from pathlib import Path @@ -44,188 +45,199 @@ from cylc.flow.task_proxy import TaskProxy -class TestSubProcPool(unittest.TestCase): - - def test_get_temporary_file(self): - """Test SubProcPool.get_temporary_file.""" - self.assertIsInstance( - SubProcPool.get_temporary_file(), SpooledTemporaryFile) - - def test_run_command_returns_0(self): - """Test basic usage, command returns 0""" - ctx = SubProcContext('truth', ['true']) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, '') - self.assertEqual(ctx.ret_code, 0) - - def test_run_command_returns_1(self): - """Test basic usage, command returns 1""" - ctx = SubProcContext('lies', ['false']) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, '') - self.assertEqual(ctx.ret_code, 1) - - def test_run_command_writes_to_out(self): - """Test basic usage, command writes to STDOUT""" - ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr']) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'pirate urrrr\n') - self.assertEqual(ctx.ret_code, 0) - - def test_run_command_writes_to_err(self): - """Test basic usage, command writes to STDERR""" - ctx = SubProcContext( - 'parrot2', ['bash', '-c', 'echo pirate errrr >&2']) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, 'pirate errrr\n') - self.assertEqual(ctx.out, '') - self.assertEqual(ctx.ret_code, 0) - - def test_run_command_with_stdin_from_str(self): - """Test STDIN from string""" - ctx = SubProcContext('meow', ['cat'], stdin_str='catches mice.\n') - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\n') - self.assertEqual(ctx.ret_code, 0) - - def 
test_run_command_with_stdin_from_unicode(self): - """Test STDIN from string with Unicode""" - ctx = SubProcContext('meow', ['cat'], stdin_str='喵\n') - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, '喵\n') - self.assertEqual(ctx.ret_code, 0) - - def test_run_command_with_stdin_from_handle(self): - """Test STDIN from a single opened file handle""" +def test_get_temporary_file(): + """Test SubProcPool.get_temporary_file.""" + assert isinstance(SubProcPool.get_temporary_file(), SpooledTemporaryFile) + + +def test_run_command_returns_0(): + """Test basic usage, command returns 0""" + ctx = SubProcContext('truth', ['true']) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == '' + assert ctx.ret_code == 0 + + +def test_run_command_returns_1(): + """Test basic usage, command returns 1""" + ctx = SubProcContext('lies', ['false']) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == '' + assert ctx.ret_code == 1 + + +def test_run_command_writes_to_out(): + """Test basic usage, command writes to STDOUT""" + ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr']) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'pirate urrrr\n' + assert ctx.ret_code == 0 + + +def test_run_command_writes_to_err(): + """Test basic usage, command writes to STDERR""" + ctx = SubProcContext('parrot2', ['bash', '-c', 'echo pirate errrr >&2']) + SubProcPool.run_command(ctx) + assert ctx.err == 'pirate errrr\n' + assert ctx.out == '' + assert ctx.ret_code == 0 + + +def test_run_command_with_stdin_from_str(): + """Test STDIN from string""" + ctx = SubProcContext('meow', ['cat'], stdin_str='catches mice.\n') + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\n' + assert ctx.ret_code == 0 + + +def test_run_command_with_stdin_from_unicode(): + """Test STDIN from string with Unicode""" + ctx = SubProcContext('meow', ['cat'], stdin_str='喵\n') + 
SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == '喵\n' + assert ctx.ret_code == 0 + + +def test_run_command_with_stdin_from_handle(): + """Test STDIN from a single opened file handle""" + handle = TemporaryFile() + handle.write('catches mice.\n'.encode('UTF-8')) + handle.seek(0) + ctx = SubProcContext('meow', ['cat'], stdin_files=[handle]) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\n' + assert ctx.ret_code == 0 + handle.close() + + +def test_run_command_with_stdin_from_path(): + """Test STDIN from a single file path""" + handle = NamedTemporaryFile() + handle.write('catches mice.\n'.encode('UTF-8')) + handle.seek(0) + ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name]) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\n' + assert ctx.ret_code == 0 + handle.close() + + +def test_run_command_with_stdin_from_handles(): + """Test STDIN from multiple file handles""" + handles = [] + for txt in ['catches mice.\n', 'eat fish.\n']: handle = TemporaryFile() - handle.write('catches mice.\n'.encode('UTF-8')) + handle.write(txt.encode('UTF-8')) handle.seek(0) - ctx = SubProcContext('meow', ['cat'], stdin_files=[handle]) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\n') - self.assertEqual(ctx.ret_code, 0) + handles.append(handle) + ctx = SubProcContext('meow', ['cat'], stdin_files=handles) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\neat fish.\n' + assert ctx.ret_code == 0 + for handle in handles: handle.close() - def test_run_command_with_stdin_from_path(self): - """Test STDIN from a single file path""" + +def test_run_command_with_stdin_from_paths(): + """Test STDIN from multiple file paths""" + handles = [] + for txt in ['catches mice.\n', 'eat fish.\n']: handle = NamedTemporaryFile() - handle.write('catches mice.\n'.encode('UTF-8')) + 
handle.write(txt.encode('UTF-8')) handle.seek(0) - ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name]) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\n') - self.assertEqual(ctx.ret_code, 0) + handles.append(handle) + ctx = SubProcContext( + 'meow', ['cat'], stdin_files=[handle.name for handle in handles] + ) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\neat fish.\n' + assert ctx.ret_code == 0 + for handle in handles: handle.close() - def test_run_command_with_stdin_from_handles(self): - """Test STDIN from multiple file handles""" - handles = [] - for txt in ['catches mice.\n', 'eat fish.\n']: - handle = TemporaryFile() - handle.write(txt.encode('UTF-8')) - handle.seek(0) - handles.append(handle) - ctx = SubProcContext('meow', ['cat'], stdin_files=handles) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\neat fish.\n') - self.assertEqual(ctx.ret_code, 0) - for handle in handles: - handle.close() - - def test_run_command_with_stdin_from_paths(self): - """Test STDIN from multiple file paths""" - handles = [] - for txt in ['catches mice.\n', 'eat fish.\n']: - handle = NamedTemporaryFile() - handle.write(txt.encode('UTF-8')) - handle.seek(0) - handles.append(handle) - ctx = SubProcContext( - 'meow', ['cat'], stdin_files=[handle.name for handle in handles]) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\neat fish.\n') - self.assertEqual(ctx.ret_code, 0) - for handle in handles: - handle.close() - - def test_xfunction(self): - """Test xtrigger function import.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - the_answer_file = python_dir / "the_answer.py" - with the_answer_file.open(mode="w") as f: - f.write("""the_answer = lambda: 42""") - f.flush() - f_name = 
"the_answer" - fn = get_xtrig_func(f_name, f_name, temp_dir) - result = fn() - self.assertEqual(42, result) - - def test_xfunction_cache(self): - """Test xtrigger function import cache.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - amandita_file = python_dir / "amandita.py" - with amandita_file.open(mode="w") as f: - f.write("""choco = lambda: 'chocolate'""") - f.flush() - m_name = "amandita" # module - f_name = "choco" # function - fn = get_xtrig_func(m_name, f_name, temp_dir) - result = fn() - self.assertEqual('chocolate', result) - - # is in the cache - self.assertTrue((m_name, f_name) in _XTRIG_FUNC_CACHE) - # returned from cache - self.assertEqual(fn, get_xtrig_func(m_name, f_name, temp_dir)) - - def test_xfunction_import_error(self): - """Test for error on importing a xtrigger function. - - To prevent the test eventually failing if the test function is added - and successfully imported, we use an invalid module name as per Python - spec. 
- """ - with TemporaryDirectory() as temp_dir, self.assertRaises( - ModuleNotFoundError - ): + +def test_xfunction(): + """Test xtrigger function import.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + the_answer_file = python_dir / "the_answer.py" + with the_answer_file.open(mode="w") as f: + f.write("""the_answer = lambda: 42""") + f.flush() + f_name = "the_answer" + fn = get_xtrig_func(f_name, f_name, temp_dir) + result = fn() + assert 42 == result + + +def test_xfunction_cache(): + """Test xtrigger function import cache.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + amandita_file = python_dir / "amandita.py" + with amandita_file.open(mode="w") as f: + f.write("""choco = lambda: 'chocolate'""") + f.flush() + m_name = "amandita" # module + f_name = "choco" # function + fn = get_xtrig_func(m_name, f_name, temp_dir) + result = fn() + assert 'chocolate' == result + + # is in the cache + assert (m_name, f_name) in _XTRIG_FUNC_CACHE + # returned from cache + assert fn, get_xtrig_func(m_name, f_name == temp_dir) + + +def test_xfunction_import_error(): + """Test for error on importing a xtrigger function. + + To prevent the test eventually failing if the test function is added + and successfully imported, we use an invalid module name as per Python + spec. 
+ """ + with TemporaryDirectory() as temp_dir: + with pytest.raises(ModuleNotFoundError): get_xtrig_func("invalid-module-name", "func-name", temp_dir) - def test_xfunction_attribute_error(self): - """Test for error on looking for an attribute in a xtrigger script.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - the_answer_file = python_dir / "the_sword.py" - with the_answer_file.open(mode="w") as f: - f.write("""the_droid = lambda: 'excalibur'""") - f.flush() - f_name = "the_sword" - with self.assertRaises(AttributeError): - get_xtrig_func(f_name, f_name, temp_dir) + +def test_xfunction_attribute_error(): + """Test for error on looking for an attribute in a xtrigger script.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + the_answer_file = python_dir / "the_sword.py" + with the_answer_file.open(mode="w") as f: + f.write("""the_droid = lambda: 'excalibur'""") + f.flush() + f_name = "the_sword" + with pytest.raises(AttributeError): + get_xtrig_func(f_name, f_name, temp_dir) @pytest.fixture def mock_ctx(): def inner_(ret_code=None, host=None, cmd_key=None, cmd=None): - """Provide a SimpleNamespace which looks like a ctx object. 
- """ + """Provide a SimpleNamespace which looks like a ctx object.""" inputs = locals() defaults = { - 'ret_code': 255, 'host': 'mouse', 'cmd_key': 'my-command', - 'cmd': ['bistromathic', 'take-off'] + 'ret_code': 255, + 'host': 'mouse', + 'cmd_key': 'my-command', + 'cmd': ['bistromathic', 'take-off'], } for key in inputs: if inputs[key] is None: @@ -235,9 +247,10 @@ def inner_(ret_code=None, host=None, cmd_key=None, cmd=None): timestamp=None, ret_code=inputs['ret_code'], host=inputs['host'], - cmd_key=inputs['cmd_key'] + cmd_key=inputs['cmd_key'], ) return ctx + yield inner_ @@ -260,21 +273,19 @@ def _test_callback_255(ctx, foo=''): 'platform: None - Could not connect to mouse.', 255, 'ssh', - id="return 255" + id="return 255", ), pytest.param( 'platform: localhost - Could not connect to mouse.', 255, TaskJobLogsRetrieveContext(['ssh', 'something'], None, None), - id="return 255 (log-ret)" - ) - ] + id="return 255 (log-ret)", + ), + ], ) -def test__run_command_exit( - caplog, mock_ctx, expect, ret_code, cmd_key +def test__run_command_exit(caplog, mock_ctx, expect, ret_code, cmd_key ): - """It runs a callback - """ + """It runs a callback""" ctx = mock_ctx(ret_code=ret_code, cmd_key=cmd_key, cmd=['ssh']) SubProcPool._run_command_exit( ctx, callback=_test_callback, callback_255=_test_callback_255 @@ -293,9 +304,7 @@ def test__run_command_exit_no_255_callback(caplog, mock_ctx): def test__run_command_exit_no_gettable_platform(caplog, mock_ctx): """It logs being unable to select a platform""" ret_ctx = TaskJobLogsRetrieveContext( - platform_name='rhenas', - max_size=256, - key='rhenas' + platform_name='rhenas', max_size=256, key='rhenas' ) ctx = mock_ctx(cmd_key=ret_ctx, cmd=['ssh'], ret_code=255) SubProcPool._run_command_exit(ctx, callback=_test_callback) @@ -310,20 +319,19 @@ def test__run_command_exit_no_255_args(caplog, mock_ctx): mock_ctx(cmd=['ssh', 'Zaphod']), callback=_test_callback, callback_args=['Zaphod'], - callback_255=_test_callback_255 + 
callback_255=_test_callback_255, ) assert '255' in caplog.records[1].msg def test__run_command_exit_add_to_badhosts(mock_ctx): - """It updates the list of badhosts - """ + """It updates the list of badhosts""" badhosts = {'foo', 'bar'} SubProcPool._run_command_exit( mock_ctx(cmd=['ssh']), bad_hosts=badhosts, callback=print, - callback_args=['Welcome to Magrathea'] + callback_args=['Welcome to Magrathea'], ) assert badhosts == {'foo', 'bar', 'mouse'} @@ -335,32 +343,36 @@ def test__run_command_exit_add_to_badhosts_log(caplog, mock_ctx): mock_ctx(cmd=['ssh']), bad_hosts=badhosts, callback=lambda x, t: print(str(x)), - callback_args=[TaskProxy( - Tokens('~u/w//c/t/2'), - SimpleNamespace( - name='t', dependencies={}, sequential='', - external_triggers=[], xtrig_labels={}, - expiration_offset=None, - outputs={ - TASK_OUTPUT_SUBMITTED: [None, None], - TASK_OUTPUT_SUBMIT_FAILED: [None, None], - TASK_OUTPUT_SUCCEEDED: [None, None], - TASK_OUTPUT_FAILED: [None, None], - TASK_OUTPUT_EXPIRED: [None, None], - }, - graph_children={}, rtconfig={'platform': 'foo'} - - ), - ISO8601Point('1990') - )] + callback_args=[ + TaskProxy( + Tokens('~u/w//c/t/2'), + SimpleNamespace( + name='t', + dependencies={}, + sequential='', + external_triggers=[], + xtrig_labels={}, + expiration_offset=None, + outputs={ + TASK_OUTPUT_SUBMITTED: [None, None], + TASK_OUTPUT_SUBMIT_FAILED: [None, None], + TASK_OUTPUT_SUCCEEDED: [None, None], + TASK_OUTPUT_FAILED: [None, None], + TASK_OUTPUT_EXPIRED: [None, None], + }, + graph_children={}, + rtconfig={'platform': 'foo'}, + ), + ISO8601Point('1990'), + ) + ], ) assert 'platform: foo' in caplog.records[0].message assert badhosts == {'foo', 'bar', 'mouse'} def test__run_command_exit_rsync_fails(mock_ctx): - """It updates the list of badhosts - """ + """It updates the list of badhosts""" badhosts = {'foo', 'bar'} ctx = mock_ctx(cmd=['rsync'], ret_code=42, cmd_key='file-install') SubProcPool._run_command_exit( @@ -371,10 +383,10 @@ def 
test__run_command_exit_rsync_fails(mock_ctx): { 'name': 'Magrathea', 'ssh command': 'ssh', - 'rsync command': 'rsync command' + 'rsync command': 'rsync command', }, 'Welcome to Magrathea', - ] + ], ) assert badhosts == {'foo', 'bar', 'mouse'} @@ -384,12 +396,11 @@ def test__run_command_exit_rsync_fails(mock_ctx): [ (True, {'cmd': ['ssh'], 'ret_code': 255}), (False, {'cmd': ['foo'], 'ret_code': 255}), - (False, {'cmd': ['ssh'], 'ret_code': 42}) - ] + (False, {'cmd': ['ssh'], 'ret_code': 42}), + ], ) def test_ssh_255_fail(mock_ctx, expect, ctx_kwargs): - """It knows when a ctx has failed - """ + """It knows when a ctx has failed""" output = SubProcPool.ssh_255_fail(mock_ctx(**ctx_kwargs)) assert output == expect @@ -401,11 +412,10 @@ def test_ssh_255_fail(mock_ctx, expect, ctx_kwargs): (True, {'cmd': ['rsync'], 'ret_code': 255, 'host': 'not_local'}), (False, {'cmd': ['make it-so'], 'ret_code': 255, 'host': 'not_local'}), (False, {'cmd': ['rsync'], 'ret_code': 125, 'host': 'localhost'}), - ] + ], ) def test_rsync_255_fail(mock_ctx, expect, ctx_kwargs): - """It knows when a ctx has failed - """ + """It knows when a ctx has failed""" output = SubProcPool.rsync_255_fail( mock_ctx(**ctx_kwargs), {'ssh command': 'ssh', 'rsync command': 'rsync command'}, From a6ecd2939df894b8599fda0ca2cf127b7a23910f Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Thu, 13 Mar 2025 09:53:36 +0000 Subject: [PATCH 010/101] tests/u: test_subprocpool.py::test_run_command_writes_to_err * Attempt to fix flaky test. * Cut out shell profile files to omit some spurious stderr. --- tests/unit/test_subprocpool.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/unit/test_subprocpool.py b/tests/unit/test_subprocpool.py index 2b5ea409d68..1f876fd83ef 100644 --- a/tests/unit/test_subprocpool.py +++ b/tests/unit/test_subprocpool.py @@ -14,16 +14,16 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
+from pathlib import Path +from types import SimpleNamespace from tempfile import ( NamedTemporaryFile, SpooledTemporaryFile, TemporaryFile, TemporaryDirectory, ) -import pytest -from pathlib import Path -from types import SimpleNamespace +import pytest from cylc.flow import LOG from cylc.flow.id import Tokens @@ -79,9 +79,12 @@ def test_run_command_writes_to_out(): def test_run_command_writes_to_err(): """Test basic usage, command writes to STDERR""" - ctx = SubProcContext('parrot2', ['bash', '-c', 'echo pirate errrr >&2']) + ctx = SubProcContext( + 'parrot2', + ['bash', '--noprofile', '--norc', '-c', 'echo pirate errrr >&2'] + ) SubProcPool.run_command(ctx) - assert ctx.err == 'pirate errrr\n' + assert 'pirate errrr\n' assert ctx.out == '' assert ctx.ret_code == 0 From e5119b4324862404695e526bc7c988dc855f07a2 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 4 Mar 2025 10:50:38 +0000 Subject: [PATCH 011/101] Time Series now working --- cylc/flow/scripts/profile.py | 98 ++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/cylc/flow/scripts/profile.py b/cylc/flow/scripts/profile.py index 2d3f1faa0ce..1bd7912903f 100755 --- a/cylc/flow/scripts/profile.py +++ b/cylc/flow/scripts/profile.py @@ -35,7 +35,6 @@ PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") - def get_option_parser() -> COP: parser = COP( __doc__, @@ -46,12 +45,9 @@ def get_option_parser() -> COP: "-i", type=int, help="interval between query cycles in seconds", default=10, dest="delay") parser.add_option( - "-o", type=str, help="output directory for json file", - default=os.environ['DATADIR'], dest="output_dir") - parser.add_option( - "-m", type=str, help="Location of memory process files", + "-m", type=str, help="Location of cgroups directory", default="/sys/fs/cgroup", - dest="memory") + dest="cgroup_location") return parser @@ -70,7 +66,6 @@ class Process: """Class for representing CPU and Memory usage of a process""" cgroup_memory_path: str 
cgroup_cpu_path: str - job_id: str def stop_profiler(*args): @@ -88,24 +83,28 @@ def parse_memory_file(process): return int(line) -def parse_cpu_file(process): - """Open the memory stat file and copy the appropriate data""" +def parse_cpu_file(process, cgroup_version): + """Open the memory stat file and return the appropriate data""" memory_stats = {} - for line in open(process.cgroup_cpu_path): - if "usage_usec" in line: - return int(re.findall(r'\d+', line)[0]) + if cgroup_version == 1: + for line in open(process.cgroup_cpu_path): + if "usage_usec" in line: + return int(re.findall(r'\d+', line)[0]) / 1000 + elif cgroup_version == 2: + for line in open(process.cgroup_cpu_path): + # Cgroups v2 uses nanoseconds + return int(line) / 1000000 + else: + raise FileNotFoundError("cpu usage files not found") -def write_data(process, data, output_dir, data_type, filename): - - # Build the output file path - path = os.path.join(output_dir, process.job_id + data_type) +def write_data(data, filename): try: with open(filename, 'w') as f: f.write(data + "\n") except IOError: - raise IOError("Unable to write memory data to file") + raise IOError("Unable to write data to file:" + filename) def get_cgroup_dir(): @@ -119,31 +118,41 @@ def get_cgroup_dir(): def profile(args): - + # Find the cgroup that this process is running in. 
+ # Cylc will put this profiler in the same cgroup as the job it is profiling cgroup_name = get_cgroup_dir() - # AZURE SPICE CGROUP LOCATION - cgroup_location = "/sys/fs/cgroup/" + cgroup_name + # HPC uses cgroups v2 and SPICE uses cgroups v1 + cgroup_version = None + + if Path.exists(Path(args.cgroup_location + cgroup_name)): + cgroup_version = 1 + elif Path.exists(Path(args.cgroup_location + "/memory" + cgroup_name)): + cgroup_version = 2 + else: + raise FileNotFoundError("cgroups not found:" + cgroup_name) + peak_memory = 0 processes = [] - # last_system_usage = None - # last_cpu_usage = None - # Find the correct memory_stat file for the process - if not Path.exists(Path(cgroup_location)): - raise FileNotFoundError("cgroups not found:" + cgroup_location) - try: - # Find memory.stat files - for job_id in os.listdir(cgroup_location): - if "memory.peak" in job_id: - processes.append(Process( - cgroup_memory_path=cgroup_location + "/" + job_id, - cgroup_cpu_path=cgroup_location + "/" + "cpu.stat", - job_id=job_id)) - except FileNotFoundError as e: - print(e) - raise FileNotFoundError("cgroups not found:" + cgroup_location) - - # cpu_count = get_host_num_cpus(args.cpuset_path, processes) + + if cgroup_version == 1: + try: + processes.append(Process( + cgroup_memory_path=args.cgroup_location + cgroup_name + "/" + "memory.peak", + cgroup_cpu_path=args.cgroup_location + cgroup_name + "/" + "cpu.stat")) + except FileNotFoundError as e: + print(e) + raise FileNotFoundError("cgroups not found:" + args.cgroup_location) + + elif cgroup_version == 2: + try: + processes.append(Process( + cgroup_memory_path=args.cgroup_location + "/memory" + cgroup_name + "/memory.max_usage_in_bytes", + cgroup_cpu_path=args.cgroup_location + "/cpu" + cgroup_name + "/cpuacct.usage")) + except FileNotFoundError as e: + print(e) + raise FileNotFoundError("cgroups not found:" + args.cgroup_location) + while True: # Write memory usage data for process in processes: @@ -152,20 +161,13 @@ def 
profile(args): memory = parse_memory_file(process) if memory > peak_memory: peak_memory = memory - write_data(process, str(peak_memory), args.output_dir, ".memory", "max_rss") - cpu_time = parse_cpu_file(process) - write_data(process, str(cpu_time), args.output_dir, ".cpu", "cpu_time") + write_data(str(peak_memory), "max_rss") + cpu_time = parse_cpu_file(process, cgroup_version) + write_data(str(cpu_time), "cpu_time") except (OSError, IOError, ValueError) as error: print(error) - # process, usage_percent = get_cpu_percent( - # cpu_count, args.proc_path, - # process, last_system_usage, last_cpu_usage) - # - # write_data(process, usage_percent, - # args.output_dir, ".cpu") - time.sleep(args.delay) From 9f593fc562fd3df15cc6c1354be89a80f72eef79 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 12 Mar 2025 08:51:49 +0000 Subject: [PATCH 012/101] Profiler sends KB instead of bytes --- cylc/flow/scripts/profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/scripts/profile.py b/cylc/flow/scripts/profile.py index 1bd7912903f..8a7a1f235e3 100755 --- a/cylc/flow/scripts/profile.py +++ b/cylc/flow/scripts/profile.py @@ -80,7 +80,7 @@ def parse_memory_file(process): memory_stats = {} for line in open(process.cgroup_memory_path): - return int(line) + return int(line) // 1024 def parse_cpu_file(process, cgroup_version): From 5f84c2f4b3a3623ecf1b85cef3a42d5fe0d5b776 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 14 Mar 2025 15:27:39 +0000 Subject: [PATCH 013/101] Modifying unit tests --- cylc/flow/etc/job.sh | 11 +- cylc/flow/scripts/profile.py | 177 ----------------------- cylc/flow/scripts/profiler.py | 154 +++++++++----------- tests/functional/jobscript/02-profiler.t | 7 +- tests/unit/test_job_file.py | 14 +- 5 files changed, 88 insertions(+), 275 deletions(-) delete mode 100755 cylc/flow/scripts/profile.py diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 0d1e3275604..c0837bfeba6 100755 --- 
a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -208,14 +208,14 @@ cylc__set_return() { ############################################################################### # Save the data using cylc message and exit the profiler cylc__kill_profiler() { - if [[ -n ${profiler_pid:-} ]]; then - kill -s SIGINT "${profiler_pid}" + if [[ -n "${cpu_time:-}" ]]; then + cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: cpu_time $cpu_time" || true fi - if [ -n "${max_rss}" ]; then + if [[ -n "${max_rss:-}" ]]; then cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: max_rss $max_rss" || true fi - if [ -n "${cpu_time}" ]; then - cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: cpu_time $cpu_time" || true + if [[ -f "proc/${profiler_pid}" ]]; then + kill -s SIGINT "${profiler_pid}" || true fi } @@ -301,6 +301,7 @@ cylc__job_finish_err() { # (Ignore shellcheck "globbing and word splitting" warning here). # shellcheck disable=SC2086 trap '' ${CYLC_VACATION_SIGNALS:-} ${CYLC_FAIL_SIGNALS} + cylc__kill_profiler if [[ -n "${CYLC_TASK_MESSAGE_STARTED_PID:-}" ]]; then wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>'/dev/null' || true fi diff --git a/cylc/flow/scripts/profile.py b/cylc/flow/scripts/profile.py deleted file mode 100755 index 8a7a1f235e3..00000000000 --- a/cylc/flow/scripts/profile.py +++ /dev/null @@ -1,177 +0,0 @@ -#!/usr/bin/env python3 -# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. -# Copyright (C) NIWA & British Crown (Met Office) & Contributors. -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -"""cylc profiler [OPTIONS] - -Profiler which periodically polls PBS cgroups to track -the resource usage of jobs running on the node. -""" - -import os -import re -import sys -import time -import signal -import subprocess -from pathlib import Path -from dataclasses import dataclass -from cylc.flow.terminal import cli_function -from cylc.flow.option_parsers import CylcOptionParser as COP - -INTERNAL = True -PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") - - -def get_option_parser() -> COP: - parser = COP( - __doc__, - argdoc=[ - ], - ) - parser.add_option( - "-i", type=int, help="interval between query cycles in seconds", - default=10, dest="delay") - parser.add_option( - "-m", type=str, help="Location of cgroups directory", - default="/sys/fs/cgroup", - dest="cgroup_location") - - return parser - - -@cli_function(get_option_parser) -def main(parser, options): - """CLI main.""" - # Register the stop_profiler function with the signal library - signal.signal(signal.SIGINT, stop_profiler) - - profile(options) - - -@dataclass -class Process: - """Class for representing CPU and Memory usage of a process""" - cgroup_memory_path: str - cgroup_cpu_path: str - - -def stop_profiler(*args): - """This function will be executed when the SIGINT signal is sent - to this process""" - print('profiler exited') - sys.exit(0) - - -def parse_memory_file(process): - """Open the memory stat file and copy the appropriate data""" - memory_stats = {} - - for line in open(process.cgroup_memory_path): - return int(line) // 1024 - - -def parse_cpu_file(process, cgroup_version): - """Open the memory stat file and return the appropriate data""" - memory_stats = {} - - if cgroup_version == 1: - for line in open(process.cgroup_cpu_path): - if "usage_usec" in line: - return int(re.findall(r'\d+', line)[0]) / 1000 - elif cgroup_version 
== 2: - for line in open(process.cgroup_cpu_path): - # Cgroups v2 uses nanoseconds - return int(line) / 1000000 - else: - raise FileNotFoundError("cpu usage files not found") - - -def write_data(data, filename): - try: - with open(filename, 'w') as f: - f.write(data + "\n") - except IOError: - raise IOError("Unable to write data to file:" + filename) - - -def get_cgroup_dir(): - """Get the cgroup directory for the current process""" - # Get the PID of the current process - pid = os.getpid() - # Get the cgroup information for the current process - result = subprocess.run(['cat', '/proc/' + str(pid) + '/cgroup'], capture_output=True, text=True) - result = PID_REGEX.search(result.stdout).group() - return result - - -def profile(args): - # Find the cgroup that this process is running in. - # Cylc will put this profiler in the same cgroup as the job it is profiling - cgroup_name = get_cgroup_dir() - - # HPC uses cgroups v2 and SPICE uses cgroups v1 - cgroup_version = None - - if Path.exists(Path(args.cgroup_location + cgroup_name)): - cgroup_version = 1 - elif Path.exists(Path(args.cgroup_location + "/memory" + cgroup_name)): - cgroup_version = 2 - else: - raise FileNotFoundError("cgroups not found:" + cgroup_name) - - peak_memory = 0 - processes = [] - - if cgroup_version == 1: - try: - processes.append(Process( - cgroup_memory_path=args.cgroup_location + cgroup_name + "/" + "memory.peak", - cgroup_cpu_path=args.cgroup_location + cgroup_name + "/" + "cpu.stat")) - except FileNotFoundError as e: - print(e) - raise FileNotFoundError("cgroups not found:" + args.cgroup_location) - - elif cgroup_version == 2: - try: - processes.append(Process( - cgroup_memory_path=args.cgroup_location + "/memory" + cgroup_name + "/memory.max_usage_in_bytes", - cgroup_cpu_path=args.cgroup_location + "/cpu" + cgroup_name + "/cpuacct.usage")) - except FileNotFoundError as e: - print(e) - raise FileNotFoundError("cgroups not found:" + args.cgroup_location) - - while True: - # Write memory usage 
data - for process in processes: - # Only save Max RSS to disk if it is above the previous value - try: - memory = parse_memory_file(process) - if memory > peak_memory: - peak_memory = memory - write_data(str(peak_memory), "max_rss") - cpu_time = parse_cpu_file(process, cgroup_version) - write_data(str(cpu_time), "cpu_time") - - except (OSError, IOError, ValueError) as error: - print(error) - - time.sleep(args.delay) - - -if __name__ == "__main__": - - arg_parser = get_option_parser() - profile(arg_parser.parse_args()) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 392a66569c6..2b86835b17f 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -16,7 +16,7 @@ # along with this program. If not, see . """cylc profiler [OPTIONS] -Profiler which periodically polls PBS cgroups to track +Profiler which periodically polls cgroups to track the resource usage of jobs running on the node. """ @@ -54,14 +54,14 @@ def get_option_parser() -> COP: @cli_function(get_option_parser) -def main(parser, options): +def main(parser: COP, options) -> None: """CLI main.""" # Register the stop_profiler function with the signal library signal.signal(signal.SIGINT, stop_profiler) signal.signal(signal.SIGHUP, stop_profiler) signal.signal(signal.SIGTERM, stop_profiler) - profile(options) + get_config(options) @dataclass @@ -78,117 +78,107 @@ def stop_profiler(*args): sys.exit(0) -def parse_memory_file(process): +def parse_memory_file(cgroup_memory_path): """Open the memory stat file and copy the appropriate data""" - with open(process.cgroup_memory_path, 'r') as f: + with open(cgroup_memory_path, 'r') as f: for line in f: return int(line) // 1024 -def parse_cpu_file(process, cgroup_version): +def parse_cpu_file(cgroup_cpu_path, cgroup_version): """Open the memory stat file and return the appropriate data""" if cgroup_version == 1: - with open(process.cgroup_cpu_path, 'r') as f: + with open(cgroup_cpu_path, 'r') as f: for line in f: if 
"usage_usec" in line: return int(RE_INT.findall(line)[0]) // 1000 elif cgroup_version == 2: - with open(process.cgroup_cpu_path, 'r') as f: + with open(cgroup_cpu_path, 'r') as f: for line in f: # Cgroups v2 uses nanoseconds return int(line) / 1000000 - else: - raise FileNotFoundError("cpu usage files not found") def write_data(data, filename): - try: - with open(filename, 'w') as f: - f.write(data + "\n") - except IOError as err: - raise IOError("Unable to write data to file:" + filename) from err + with open(filename, 'w') as f: + f.write(data + "\n") + +def get_cgroup_version(cgroup_location: Path, cgroup_name: Path) -> int: + # HPC uses cgroups v2 and SPICE uses cgroups v1 + if Path.exists(Path(cgroup_location + cgroup_name)): + return 1 + elif Path.exists(Path(cgroup_location + "/memory" + cgroup_name)): + return 2 -def get_cgroup_dir(): + +def get_cgroup_name(): """Get the cgroup directory for the current process""" # Get the PID of the current process pid = os.getpid() - # Get the cgroup information for the current process - result = subprocess.run(['cat', '/proc/' + str(pid) + '/cgroup'], - capture_output=True, text=True, shell=False) - if result.stderr: - print(result.stderr, file=sys.stderr) - result = PID_REGEX.search(result.stdout).group() - return result + try: + # Get the cgroup information for the current process + with open('/proc/' + str(pid) + '/cgroup', 'r') as f: + result = f.read() + result = PID_REGEX.search(result).group() + return result + except FileNotFoundError as err: + raise FileNotFoundError('/proc/' + str(pid) + '/cgroup not found') + + except AttributeError as err: + raise AttributeError("No cgroup found for process:", pid) from err + +def get_cgroup_paths(version, location, name): + + if version == 1: + return Process( + cgroup_memory_path=location + + name + "/" + "memory.peak", + cgroup_cpu_path=location + + name + "/" + "cpu.stat") + + elif version == 2: + return Process( + cgroup_memory_path=location + "/memory" + + name + 
"/memory.max_usage_in_bytes", + cgroup_cpu_path=location + "/cpu" + + name + "/cpuacct.usage") + + +def profile(process, version, delay, keep_looping=lambda: True): + # The infinite loop that will constantly poll the cgroup + # The lambda function is used to allow the loop to be stopped in unit tests + peak_memory = 0 + while keep_looping(): + # Write memory usage data + # Only save Max RSS to disk if it is above the previous value + cpu_time = parse_cpu_file(process.cgroup_cpu_path, version) + write_data(str(cpu_time), "cpu_time") + + memory = parse_memory_file(process.cgroup_memory_path) + if memory > peak_memory: + peak_memory = memory + write_data(str(peak_memory), "max_rss") + time.sleep(delay) -def profile(args): + +def get_config(args): # Find the cgroup that this process is running in. # Cylc will put this profiler in the same cgroup # as the job it is profiling - cgroup_name = get_cgroup_dir() - - # HPC uses cgroups v2 and SPICE uses cgroups v1 - cgroup_version = None - - if Path.exists(Path(args.cgroup_location + cgroup_name)): - cgroup_version = 1 - elif Path.exists(Path(args.cgroup_location + "/memory" + cgroup_name)): - cgroup_version = 2 - else: - raise FileNotFoundError("cgroups not found:" + cgroup_name) - - peak_memory = 0 - processes = [] - - if cgroup_version == 1: - try: - processes.append(Process( - cgroup_memory_path=args.cgroup_location + - cgroup_name + "/" + "memory.peak", - cgroup_cpu_path=args.cgroup_location + - cgroup_name + "/" + "cpu.stat")) - except FileNotFoundError as err: - print(err) - raise FileNotFoundError("cgroups not found:" - + args.cgroup_location) from err - - elif cgroup_version == 2: - try: - processes.append(Process( - cgroup_memory_path=args.cgroup_location + "/memory" + - cgroup_name + "/memory.max_usage_in_bytes", - cgroup_cpu_path=args.cgroup_location + "/cpu" + - cgroup_name + "/cpuacct.usage")) - except FileNotFoundError as err: - print(err) - raise FileNotFoundError("cgroups not found:" + - args.cgroup_location) 
from err - - while True: - failures = 0 - # Write memory usage data - for process in processes: - # Only save Max RSS to disk if it is above the previous value - try: - memory = parse_memory_file(process) - if memory > peak_memory: - peak_memory = memory - write_data(str(peak_memory), "max_rss") - cpu_time = parse_cpu_file(process, cgroup_version) - write_data(str(cpu_time), "cpu_time") - - except (OSError, ValueError) as error: - failures += 1 - if failures > 5: - raise OSError("cgroup polling failure", error) from error + cgroup_name = get_cgroup_name() + cgroup_version = get_cgroup_version(args.cgroup_location, cgroup_name) + process = get_cgroup_paths(cgroup_version, + args.cgroups_location, + cgroup_name) - time.sleep(args.delay) + profile(process, cgroup_version, args.delay) if __name__ == "__main__": arg_parser = get_option_parser() - profile(arg_parser.parse_args()) + get_config(arg_parser.parse_args([])) diff --git a/tests/functional/jobscript/02-profiler.t b/tests/functional/jobscript/02-profiler.t index 50e016c5ea0..e43705d4720 100644 --- a/tests/functional/jobscript/02-profiler.t +++ b/tests/functional/jobscript/02-profiler.t @@ -18,7 +18,7 @@ # cylc profile test . 
"$(dirname "$0")/test_header" #------------------------------------------------------------------------------- -set_test_number 3 +set_test_number 2 #------------------------------------------------------------------------------- install_workflow "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" #------------------------------------------------------------------------------- @@ -34,13 +34,12 @@ fi export PATH_TO_CYLC_BIN="/path/to/cylc/bin" create_test_global_config ' [platforms] - [[profile]] + [[localhost]] + [[[profile]]] activate = true ' #------------------------------------------------------------------------------- TEST_NAME="${TEST_NAME_BASE}-run" workflow_run_ok "${TEST_NAME}" cylc play --reference-test --debug --no-detach "${WORKFLOW_NAME}" -grep_ok 'MAXRSS' "${WORKFLOW_RUN_DIR}/log/scheduler/log" - purge diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index 32bdcea2861..42a6a4cccea 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -65,11 +65,13 @@ def fixture_get_platform(): Returns: platforms dictionary. 
""" + def inner_func(custom_settings=None): platform = platform_from_name() if custom_settings is not None: platform.update(custom_settings) return platform + yield inner_func @@ -159,9 +161,9 @@ def test_write(fixture_get_platform): "execution_time_limit": 60 }, ('\n\n# DIRECTIVES:\n# @ job_name = farm_noises.baa.1' - '\n# @ output = directory/job.out\n# @ error = directory/' - 'job.err\n# @ wall_clock_limit = 120,60\n# @ moo = foo' - '\n# @ cluck = bar\n# @ queue') + '\n# @ output = directory/job.out\n# @ error = directory/' + 'job.err\n# @ wall_clock_limit = 120,60\n# @ moo = foo' + '\n# @ cluck = bar\n# @ queue') ), ( # Check no directives is correctly written @@ -217,7 +219,6 @@ def test_write(fixture_get_platform): "job_d": "1/test_task_id/01", "job_file_path": "$HOME/directory/job", "execution_time_limit": 1000 - }, ('\n\n# DIRECTIVES:\n#$ -N farm_noises.baa.1\n#$ -o directory/' 'job.out\n#$ -e directory/job.err\n#$ -l h_rt=0:16:40\n#$ -V\n#' @@ -383,7 +384,6 @@ def test_no_script_section_with_comment_only_script(): } with io.StringIO() as fake_file: - JobFileWriter()._write_script(fake_file, job_conf) blah = fake_file.getvalue() print(blah) @@ -404,7 +404,8 @@ def test_write_task_environment(): 'CYLC_TASK_PARAM_mouse="squeak"\n ' 'CYLC_TASK_WORK_DIR_BASE=\'farm_noises/work_d\'\n}') job_conf = { - "platform": {'communication method': 'ssh', + "platform": { + 'communication method': 'ssh', 'profile': { "activate": "true", } @@ -539,4 +540,3 @@ def test_homeless_platform(fixture_get_platform): job_sh_txt = job_sh.read() if 'HOME' in job_sh_txt: raise Exception('$HOME found in job.sh\n{job_sh_txt}') - From 47a4a981cc57df3e7bb5c0fc71e07afee3855678 Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Wed, 19 Feb 2025 15:41:45 +0000 Subject: [PATCH 014/101] host-select: fix compatibility with force-condemned hosts --- cylc/flow/cfgspec/globalcfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/cfgspec/globalcfg.py 
b/cylc/flow/cfgspec/globalcfg.py index 821b3fd3d1f..499cb29b09e 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -833,7 +833,7 @@ def default_for( preventing new workflows from starting on the "condemned" host. Any workflows running on these hosts will either migrate - to another host, or shut down according to + to another host, or shutdown according to :py:mod:`the configuration `. This feature requires ``auto restart`` to be listed From 356abffe3ec8a7bab4ffccf19209e25d18f3963c Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Tue, 18 Mar 2025 11:31:13 +0000 Subject: [PATCH 015/101] Update cylc/flow/cfgspec/globalcfg.py Co-authored-by: Hilary James Oliver --- cylc/flow/cfgspec/globalcfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 499cb29b09e..821b3fd3d1f 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -833,7 +833,7 @@ def default_for( preventing new workflows from starting on the "condemned" host. Any workflows running on these hosts will either migrate - to another host, or shutdown according to + to another host, or shut down according to :py:mod:`the configuration `. 
This feature requires ``auto restart`` to be listed From 6ed177011f869daf77f2869f2c088db599af1452 Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Thu, 13 Mar 2025 09:44:47 +0000 Subject: [PATCH 016/101] tests: convert unittest to pytest --- tests/unit/test_subprocpool.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/unit/test_subprocpool.py b/tests/unit/test_subprocpool.py index 1f876fd83ef..5357caa5889 100644 --- a/tests/unit/test_subprocpool.py +++ b/tests/unit/test_subprocpool.py @@ -79,12 +79,9 @@ def test_run_command_writes_to_out(): def test_run_command_writes_to_err(): """Test basic usage, command writes to STDERR""" - ctx = SubProcContext( - 'parrot2', - ['bash', '--noprofile', '--norc', '-c', 'echo pirate errrr >&2'] - ) + ctx = SubProcContext('parrot2', ['bash', '-c', 'echo pirate errrr >&2']) SubProcPool.run_command(ctx) - assert 'pirate errrr\n' + assert ctx.err == 'pirate errrr\n' assert ctx.out == '' assert ctx.ret_code == 0 From 4cfcffeff85f3681b9c09ffa0f4ae84460d55943 Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Thu, 13 Mar 2025 09:53:36 +0000 Subject: [PATCH 017/101] tests/u: test_subprocpool.py::test_run_command_writes_to_err * Attempt to fix flaky test. * Cut out shell profile files to omit some spurious stderr. 
--- tests/unit/test_subprocpool.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_subprocpool.py b/tests/unit/test_subprocpool.py index 5357caa5889..1f876fd83ef 100644 --- a/tests/unit/test_subprocpool.py +++ b/tests/unit/test_subprocpool.py @@ -79,9 +79,12 @@ def test_run_command_writes_to_out(): def test_run_command_writes_to_err(): """Test basic usage, command writes to STDERR""" - ctx = SubProcContext('parrot2', ['bash', '-c', 'echo pirate errrr >&2']) + ctx = SubProcContext( + 'parrot2', + ['bash', '--noprofile', '--norc', '-c', 'echo pirate errrr >&2'] + ) SubProcPool.run_command(ctx) - assert ctx.err == 'pirate errrr\n' + assert 'pirate errrr\n' assert ctx.out == '' assert ctx.ret_code == 0 From 1e5b804d35425dd3c2592225a182f2c0b5f64fe4 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 21 Mar 2025 13:24:13 +0000 Subject: [PATCH 018/101] Linting --- tests/unit/test_job_file.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index 42a6a4cccea..88a177fa0ac 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -406,10 +406,10 @@ def test_write_task_environment(): job_conf = { "platform": { 'communication method': 'ssh', - 'profile': { - "activate": "true", - } - }, + 'profile': { + "activate": "true", + } + }, "job_d": "1/moo/01", "namespace_hierarchy": ["baa", "moo"], "dependencies": ['moo', 'neigh', 'quack'], From 38c31f56d3fcf9326e9bd6071b07d81a2803294e Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 24 Mar 2025 08:49:44 +0000 Subject: [PATCH 019/101] Fail gracefully if cgroups cannot be found Revert "Fail gracefully if cgroups cannot be found" This reverts commit 92e1e11c9b392b4742501d399f191f590814e95e. 
Adding profiler unit tests updating tests --- .github/workflows/1_create_release_pr.yml | 13 - .github/workflows/2_auto_publish_release.yml | 11 - .github/workflows/bash.yml | 8 - .github/workflows/build.yml | 34 +- .github/workflows/test_conda-build.yml | 7 - .github/workflows/test_fast.yml | 7 +- .github/workflows/test_functional.yml | 4 - .github/workflows/test_tutorial_workflow.yml | 8 - cylc/flow/cfgspec/globalcfg.py | 46 +- cylc/flow/etc/job.sh | 1 - cylc/flow/host_select.py | 11 +- cylc/flow/scripts/profiler.py | 139 +++--- .../43-auto-restart-force-override-normal.t | 5 +- tests/unit/scripts/test_profiler.py | 50 ++ tests/unit/test_subprocpool.py | 445 +++++++++--------- 15 files changed, 363 insertions(+), 426 deletions(-) diff --git a/.github/workflows/1_create_release_pr.yml b/.github/workflows/1_create_release_pr.yml index db8374e0b54..10a700b7ab4 100644 --- a/.github/workflows/1_create_release_pr.yml +++ b/.github/workflows/1_create_release_pr.yml @@ -11,19 +11,6 @@ on: required: false default: 'master' -concurrency: - # Only let this run 1 at a time - group: ${{ github.workflow }} - cancel-in-progress: false - -defaults: - run: - shell: bash - -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off - jobs: create-release-pr: runs-on: ubuntu-latest diff --git a/.github/workflows/2_auto_publish_release.yml b/.github/workflows/2_auto_publish_release.yml index a2b84a46c4d..943c54090d9 100644 --- a/.github/workflows/2_auto_publish_release.yml +++ b/.github/workflows/2_auto_publish_release.yml @@ -7,18 +7,7 @@ on: # NOTE: While this is too generic, we use the `if` condition of the job to narrow it down # NOTE: Don't use `branches` as we might create release on any branch -concurrency: - # Only let this run 1 at a time - group: ${{ github.workflow }} - cancel-in-progress: false - -defaults: - run: - shell: bash - env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off # Best not to include the GH token here, only do it for the steps that need it MERGE_SHA: ${{ 
github.event.pull_request.merge_commit_sha }} CHANGELOG_FILE: CHANGES.md diff --git a/.github/workflows/bash.yml b/.github/workflows/bash.yml index d0970de9778..b7c97cd21e9 100644 --- a/.github/workflows/bash.yml +++ b/.github/workflows/bash.yml @@ -31,14 +31,6 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -defaults: - run: - shell: bash - -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off - jobs: bash-docker: runs-on: ubuntu-latest diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6d12e5b1145..2f09c9cbf39 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,20 +10,6 @@ on: - 'MANIFEST.in' # check packaging - 'pyproject.toml' # check build config - 'setup.cfg' # check deps and project config - - '.gitignore' - - '.github/workflows/build.yml' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -defaults: - run: - shell: bash -leo pipefail {0} - -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off jobs: test: @@ -32,23 +18,21 @@ jobs: strategy: fail-fast: false matrix: - os: ['ubuntu-latest', 'macos-latest'] - python: ['3.7', '3.8', '3.9', '3.10', '3'] - exclude: - - os: 'macos-latest' + os: ['ubuntu-latest'] + python: ['3.8', '3.9', '3.10', '3.11'] + include: + - os: 'ubuntu-22.04' python: '3.7' + - os: 'macos-latest' + python: '3.8' steps: - name: Checkout uses: actions/checkout@v4 - name: Setup Python - uses: mamba-org/setup-micromamba@v2 + uses: actions/setup-python@v5 with: - cache-environment: true - post-cleanup: 'all' - environment-name: cylc-build - create-args: >- - python=${{ matrix.python }} + python-version: ${{ matrix.python }} - name: Build uses: cylc/release-actions/build-python-package@v1 @@ -56,7 +40,7 @@ jobs: - name: Inspect run: | unzip -l dist/*.whl | tee files - grep -E 'cylc_flow.*.dist-info/.*COPYING' files + grep 'cylc_flow.*.dist-info/COPYING' files grep 'cylc/flow/py.typed' files grep 'cylc/flow/etc' files 
grep 'cylc/flow/etc/cylc-completion.bash' files diff --git a/.github/workflows/test_conda-build.yml b/.github/workflows/test_conda-build.yml index 619981f2ac4..b4e97117b1a 100644 --- a/.github/workflows/test_conda-build.yml +++ b/.github/workflows/test_conda-build.yml @@ -13,13 +13,6 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -defaults: - run: - shell: bash - -env: - FORCE_COLOR: 2 - jobs: test_conda_install: if: github.repository_owner == 'cylc' || github.event_name != 'schedule' diff --git a/.github/workflows/test_fast.yml b/.github/workflows/test_fast.yml index 95e8a5754c9..9847b71a454 100644 --- a/.github/workflows/test_fast.yml +++ b/.github/workflows/test_fast.yml @@ -16,9 +16,6 @@ defaults: run: shell: bash -c "exec $CONDA_PREFIX/bin/bash -elo pipefail {0}" -env: - PIP_PROGRESS_BAR: off - jobs: test: runs-on: ${{ matrix.os }} @@ -35,9 +32,11 @@ jobs: - os: 'ubuntu-latest' python-version: '3.9' # not the oldest, not the most recent version time-zone: 'XXX-09:35' + env: TZ: ${{ matrix.time-zone }} PYTEST_ADDOPTS: --cov --cov-append -n 5 --color=yes + steps: - name: Checkout uses: actions/checkout@v4 @@ -107,8 +106,6 @@ jobs: strategy: matrix: python-version: ['3'] - env: - FORCE_COLOR: 2 steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/test_functional.yml b/.github/workflows/test_functional.yml index d85d3a60c9b..aa7ebb8de60 100644 --- a/.github/workflows/test_functional.yml +++ b/.github/workflows/test_functional.yml @@ -36,10 +36,6 @@ defaults: run: shell: bash -c "exec $CONDA_PREFIX/bin/bash -elo pipefail {0}" -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off - jobs: test: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_tutorial_workflow.yml b/.github/workflows/test_tutorial_workflow.yml index 01808ca4871..3faa8469ef4 100644 --- a/.github/workflows/test_tutorial_workflow.yml +++ b/.github/workflows/test_tutorial_workflow.yml @@ -17,14 +17,6 @@ concurrency: group: ${{ 
github.workflow }}-${{ github.ref }} cancel-in-progress: true -defaults: - run: - shell: bash - -env: - FORCE_COLOR: 2 - PIP_PROGRESS_BAR: off - jobs: test: strategy: diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 821b3fd3d1f..6d6dc23f717 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -826,52 +826,16 @@ def default_for( range. ''') Conf('condemned', VDR.V_ABSOLUTE_HOST_LIST, desc=f''' - List run hosts that workflows should *not* run on. + These hosts will not be used to run jobs. - These hosts will be subtracted from the - `available ` hosts - preventing new workflows from starting on the "condemned" host. - - Any workflows running on these hosts will either migrate - to another host, or shut down according to - :py:mod:`the configuration `. - - This feature requires ``auto restart`` to be listed - in `global.cylc[scheduler][main loop]plugins`. - - For more information, see the - :py:mod:`auto restart ` - plugin. - - .. rubric:: Example: - - .. code-block:: cylc - - [scheduler] - [[main loop]] - # activate the "auto restart" plugin - plugins = auto restart - [[run hosts]] - # there are three hosts in the "pool" - available = host1, host2, host3 - - # however two have been taken out: - # * workflows running on "host1" will attempt to - # restart on "host3" - # * workflows running on "host2" will shutdown - condemned = host1, host2! + If workflows are already running on + condemned hosts, Cylc will shut them down and + restart them on different hosts. .. seealso:: - :py:mod:`cylc.flow.main_loop.auto_restart` :ref:`auto-stop-restart` - .. versionchanged:: 8.4.2 - - The "force mode" (activated by a "!" suffix) caused issues - at workflow startup for Cylc versions between 8.0.0 and - 8.4.1 inclusive. - .. versionchanged:: 8.0.0 {REPLACES}``[suite servers]condemned hosts``. @@ -1381,7 +1345,7 @@ def default_for( The means by which task progress messages are reported back to the running workflow. 
- .. rubric:: Options: + ..rubric:: Options: zmq Direct client-server TCP communication via network ports diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index c0837bfeba6..a4747579538 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -293,7 +293,6 @@ cylc__job__run_inst_func() { # Returns: # exit ${CYLC_TASK_USER_SCRIPT_EXITCODE} cylc__job_finish_err() { - cylc__kill_profiler CYLC_TASK_USER_SCRIPT_EXITCODE="${CYLC_TASK_USER_SCRIPT_EXITCODE:-$?}" typeset signal="$1" typeset run_err_script="$2" diff --git a/cylc/flow/host_select.py b/cylc/flow/host_select.py index cf940864b90..69e32c68a71 100644 --- a/cylc/flow/host_select.py +++ b/cylc/flow/host_select.py @@ -128,13 +128,6 @@ def select_workflow_host(cached=True): # be returned with the up-to-date configuration. global_config = glbl_cfg(cached=cached) - # condemned hosts may be suffixed with an "!" to activate "force mode" - blacklist = [] - for host in global_config.get(['scheduler', 'run hosts', 'condemned'], []): - if host.endswith('!'): - host = host[:-1] - blacklist.append(host) - return select_host( # list of workflow hosts global_config.get([ @@ -145,7 +138,9 @@ def select_workflow_host(cached=True): 'scheduler', 'run hosts', 'ranking' ]), # list of condemned hosts - blacklist=blacklist, + blacklist=global_config.get( + ['scheduler', 'run hosts', 'condemned'] + ), blacklist_name='condemned host' ) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 2b86835b17f..060d4f67e39 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -16,7 +16,7 @@ # along with this program. If not, see . """cylc profiler [OPTIONS] -Profiler which periodically polls cgroups to track +Profiler which periodically polls PBS cgroups to track the resource usage of jobs running on the node. 
""" @@ -25,7 +25,6 @@ import sys import time import signal -import subprocess from pathlib import Path from dataclasses import dataclass from cylc.flow.terminal import cli_function @@ -54,14 +53,14 @@ def get_option_parser() -> COP: @cli_function(get_option_parser) -def main(parser: COP, options) -> None: +def main(parser, options): """CLI main.""" # Register the stop_profiler function with the signal library signal.signal(signal.SIGINT, stop_profiler) signal.signal(signal.SIGHUP, stop_profiler) signal.signal(signal.SIGTERM, stop_profiler) - get_config(options) + profile(options) @dataclass @@ -78,43 +77,40 @@ def stop_profiler(*args): sys.exit(0) -def parse_memory_file(cgroup_memory_path): +def parse_memory_file(process): """Open the memory stat file and copy the appropriate data""" - with open(cgroup_memory_path, 'r') as f: + with open(process.cgroup_memory_path, 'r') as f: for line in f: return int(line) // 1024 -def parse_cpu_file(cgroup_cpu_path, cgroup_version): +def parse_cpu_file(process, cgroup_version): """Open the memory stat file and return the appropriate data""" if cgroup_version == 1: - with open(cgroup_cpu_path, 'r') as f: + with open(process.cgroup_cpu_path, 'r') as f: for line in f: if "usage_usec" in line: return int(RE_INT.findall(line)[0]) // 1000 elif cgroup_version == 2: - with open(cgroup_cpu_path, 'r') as f: + with open(process.cgroup_cpu_path, 'r') as f: for line in f: # Cgroups v2 uses nanoseconds return int(line) / 1000000 + else: + raise FileNotFoundError("cpu usage files not found") def write_data(data, filename): - with open(filename, 'w') as f: - f.write(data + "\n") - - -def get_cgroup_version(cgroup_location: Path, cgroup_name: Path) -> int: - # HPC uses cgroups v2 and SPICE uses cgroups v1 - if Path.exists(Path(cgroup_location + cgroup_name)): - return 1 - elif Path.exists(Path(cgroup_location + "/memory" + cgroup_name)): - return 2 + try: + with open(filename, 'w') as f: + f.write(data + "\n") + except IOError as err: + raise 
IOError("Unable to write data to file:" + filename) from err -def get_cgroup_name(): +def get_cgroup_dir(): """Get the cgroup directory for the current process""" # Get the PID of the current process pid = os.getpid() @@ -125,60 +121,79 @@ def get_cgroup_name(): result = PID_REGEX.search(result).group() return result except FileNotFoundError as err: - raise FileNotFoundError('/proc/' + str(pid) + '/cgroup not found') - + print(err) + print('/proc/' + str(pid) + '/cgroup not found') + exit() except AttributeError as err: - raise AttributeError("No cgroup found for process:", pid) from err + print(err) + print("No cgroup found for process") + exit() -def get_cgroup_paths(version, location, name): - if version == 1: - return Process( - cgroup_memory_path=location + - name + "/" + "memory.peak", - cgroup_cpu_path=location + - name + "/" + "cpu.stat") +def profile(args): + # Find the cgroup that this process is running in. + # Cylc will put this profiler in the same cgroup + # as the job it is profiling + cgroup_name = get_cgroup_dir() - elif version == 2: - return Process( - cgroup_memory_path=location + "/memory" + - name + "/memory.max_usage_in_bytes", - cgroup_cpu_path=location + "/cpu" + - name + "/cpuacct.usage") + # HPC uses cgroups v2 and SPICE uses cgroups v1 + cgroup_version = None + if Path.exists(Path(args.cgroup_location + cgroup_name)): + cgroup_version = 1 + elif Path.exists(Path(args.cgroup_location + "/memory" + cgroup_name)): + cgroup_version = 2 + else: + raise FileNotFoundError("cgroups not found:" + cgroup_name) -def profile(process, version, delay, keep_looping=lambda: True): - # The infinite loop that will constantly poll the cgroup - # The lambda function is used to allow the loop to be stopped in unit tests peak_memory = 0 - while keep_looping(): - # Write memory usage data - # Only save Max RSS to disk if it is above the previous value - cpu_time = parse_cpu_file(process.cgroup_cpu_path, version) - write_data(str(cpu_time), "cpu_time") - - 
memory = parse_memory_file(process.cgroup_memory_path) - if memory > peak_memory: - peak_memory = memory - write_data(str(peak_memory), "max_rss") - - time.sleep(delay) + processes = [] + if cgroup_version == 1: + try: + processes.append(Process( + cgroup_memory_path=args.cgroup_location + + cgroup_name + "/" + "memory.peak", + cgroup_cpu_path=args.cgroup_location + + cgroup_name + "/" + "cpu.stat")) + except FileNotFoundError as err: + print(err) + raise FileNotFoundError("cgroups not found:" + + args.cgroup_location) from err + elif cgroup_version == 2: + try: + processes.append(Process( + cgroup_memory_path=args.cgroup_location + "/memory" + + cgroup_name + "/memory.max_usage_in_bytes", + cgroup_cpu_path=args.cgroup_location + "/cpu" + + cgroup_name + "/cpuacct.usage")) + except FileNotFoundError as err: + print(err) + raise FileNotFoundError("cgroups not found:" + + args.cgroup_location) from err + + while True: + failures = 0 + # Write memory usage data + for process in processes: + # Only save Max RSS to disk if it is above the previous value + try: + memory = parse_memory_file(process) + if memory > peak_memory: + peak_memory = memory + write_data(str(peak_memory), "max_rss") + cpu_time = parse_cpu_file(process, cgroup_version) + write_data(str(cpu_time), "cpu_time") -def get_config(args): - # Find the cgroup that this process is running in. 
- # Cylc will put this profiler in the same cgroup - # as the job it is profiling - cgroup_name = get_cgroup_name() - cgroup_version = get_cgroup_version(args.cgroup_location, cgroup_name) - process = get_cgroup_paths(cgroup_version, - args.cgroups_location, - cgroup_name) + except (OSError, ValueError) as error: + failures += 1 + if failures > 5: + raise OSError("cgroup polling failure", error) from error - profile(process, cgroup_version, args.delay) + time.sleep(args.delay) if __name__ == "__main__": arg_parser = get_option_parser() - get_config(arg_parser.parse_args([])) + profile(arg_parser.parse_args([])) diff --git a/tests/functional/restart/43-auto-restart-force-override-normal.t b/tests/functional/restart/43-auto-restart-force-override-normal.t index 35edc57d1f9..b61d08c68cb 100644 --- a/tests/functional/restart/43-auto-restart-force-override-normal.t +++ b/tests/functional/restart/43-auto-restart-force-override-normal.t @@ -50,10 +50,7 @@ create_test_global_config '' " ${BASE_GLOBAL_CONFIG} [scheduler] [[run hosts]] - available = ${CYLC_TEST_HOST_1}, ${CYLC_TEST_HOST_2} - # ensure the workflow can start if a host is condemned - # in force mode see #6623 - condemned = ${CYLC_TEST_HOST_2}! 
+ available = ${CYLC_TEST_HOST_1} " set_test_number 8 diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index f8560eafd9a..4aa49a22046 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -153,6 +153,56 @@ def test_profile_max_rss(mocker): mock_file.assert_called_with("max_rss", "w") +def test_get_cgroup_version(mocker): + + # Mock the Path.exists function call to return True + mocker.patch("pathlib.Path.exists", return_value=True) + assert get_cgroup_version('stuff/in/place', 'more_stuff') == 1 + + with mock.patch('pathlib.Path.exists', side_effect=[False, True]): + assert get_cgroup_version('stuff/in/place', 'more_stuff') == 2 + + # Mock the Path.exists function call to return False + mocker.patch("pathlib.Path.exists", return_value=False) + assert get_cgroup_version('stuff/in/other/place', 'things') is None + + +def test_get_cgroup_paths(): + + process = get_cgroup_paths(1, "test_location/", "test_name") + assert process.cgroup_memory_path == "test_location/test_name/memory.peak" + assert process.cgroup_cpu_path == "test_location/test_name/cpu.stat" + + process = get_cgroup_paths(2, "test_location", "/test_name") + assert (process.cgroup_memory_path == + "test_location/memory/test_name/memory.max_usage_in_bytes") + assert process.cgroup_cpu_path == "test_location/cpu/test_name/cpuacct.usage" + + +def test_profile_cpu(mocker): + process = get_cgroup_paths(1, "test_location/", "test_name") + + mock_file = mocker.mock_open(read_data="") + mocker.patch("builtins.open", mock_file) + mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", return_value=0) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) + run_once = mock.Mock(side_effect=[True, False]) + profile(process, 1, 1, run_once) + mock_file.assert_called_with("cpu_time", "w") + + +def test_profile_max_rss(mocker): + process = get_cgroup_paths(1, "test_location/", "test_name") + + mock_file = 
mocker.mock_open(read_data="") + mocker.patch("builtins.open", mock_file) + mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", return_value=1024) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) + run_once = mock.Mock(side_effect=[True, False]) + profile(process, 1, 1, run_once) + mock_file.assert_called_with("max_rss", "w") + + def test_stop_profiler(): with pytest.raises(SystemExit) as pytest_wrapped_e: stop_profiler() diff --git a/tests/unit/test_subprocpool.py b/tests/unit/test_subprocpool.py index 1f876fd83ef..6b686c27082 100644 --- a/tests/unit/test_subprocpool.py +++ b/tests/unit/test_subprocpool.py @@ -14,17 +14,16 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -from pathlib import Path -from types import SimpleNamespace from tempfile import ( - NamedTemporaryFile, - SpooledTemporaryFile, - TemporaryFile, - TemporaryDirectory, + NamedTemporaryFile, SpooledTemporaryFile, TemporaryFile, + TemporaryDirectory ) - +import unittest import pytest +from pathlib import Path +from types import SimpleNamespace + from cylc.flow import LOG from cylc.flow.id import Tokens from cylc.flow.cycling.iso8601 import ISO8601Point @@ -45,202 +44,188 @@ from cylc.flow.task_proxy import TaskProxy -def test_get_temporary_file(): - """Test SubProcPool.get_temporary_file.""" - assert isinstance(SubProcPool.get_temporary_file(), SpooledTemporaryFile) - - -def test_run_command_returns_0(): - """Test basic usage, command returns 0""" - ctx = SubProcContext('truth', ['true']) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == '' - assert ctx.ret_code == 0 - - -def test_run_command_returns_1(): - """Test basic usage, command returns 1""" - ctx = SubProcContext('lies', ['false']) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == '' - assert ctx.ret_code == 1 - - -def test_run_command_writes_to_out(): - """Test basic usage, command writes to 
STDOUT""" - ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr']) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'pirate urrrr\n' - assert ctx.ret_code == 0 - - -def test_run_command_writes_to_err(): - """Test basic usage, command writes to STDERR""" - ctx = SubProcContext( - 'parrot2', - ['bash', '--noprofile', '--norc', '-c', 'echo pirate errrr >&2'] - ) - SubProcPool.run_command(ctx) - assert 'pirate errrr\n' - assert ctx.out == '' - assert ctx.ret_code == 0 - - -def test_run_command_with_stdin_from_str(): - """Test STDIN from string""" - ctx = SubProcContext('meow', ['cat'], stdin_str='catches mice.\n') - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\n' - assert ctx.ret_code == 0 - - -def test_run_command_with_stdin_from_unicode(): - """Test STDIN from string with Unicode""" - ctx = SubProcContext('meow', ['cat'], stdin_str='喵\n') - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == '喵\n' - assert ctx.ret_code == 0 - - -def test_run_command_with_stdin_from_handle(): - """Test STDIN from a single opened file handle""" - handle = TemporaryFile() - handle.write('catches mice.\n'.encode('UTF-8')) - handle.seek(0) - ctx = SubProcContext('meow', ['cat'], stdin_files=[handle]) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\n' - assert ctx.ret_code == 0 - handle.close() - - -def test_run_command_with_stdin_from_path(): - """Test STDIN from a single file path""" - handle = NamedTemporaryFile() - handle.write('catches mice.\n'.encode('UTF-8')) - handle.seek(0) - ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name]) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\n' - assert ctx.ret_code == 0 - handle.close() - - -def test_run_command_with_stdin_from_handles(): - """Test STDIN from multiple file handles""" - handles = [] - for txt in ['catches mice.\n', 'eat fish.\n']: +class 
TestSubProcPool(unittest.TestCase): + + def test_get_temporary_file(self): + """Test SubProcPool.get_temporary_file.""" + self.assertIsInstance( + SubProcPool.get_temporary_file(), SpooledTemporaryFile) + + def test_run_command_returns_0(self): + """Test basic usage, command returns 0""" + ctx = SubProcContext('truth', ['true']) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, '') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_returns_1(self): + """Test basic usage, command returns 1""" + ctx = SubProcContext('lies', ['false']) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, '') + self.assertEqual(ctx.ret_code, 1) + + def test_run_command_writes_to_out(self): + """Test basic usage, command writes to STDOUT""" + ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr']) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'pirate urrrr\n') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_writes_to_err(self): + """Test basic usage, command writes to STDERR""" + ctx = SubProcContext( + 'parrot2', ['bash', '-c', 'echo pirate errrr >&2']) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, 'pirate errrr\n') + self.assertEqual(ctx.out, '') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_with_stdin_from_str(self): + """Test STDIN from string""" + ctx = SubProcContext('meow', ['cat'], stdin_str='catches mice.\n') + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\n') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_with_stdin_from_unicode(self): + """Test STDIN from string with Unicode""" + ctx = SubProcContext('meow', ['cat'], stdin_str='喵\n') + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, '喵\n') + self.assertEqual(ctx.ret_code, 0) + + def test_run_command_with_stdin_from_handle(self): + """Test 
STDIN from a single opened file handle""" handle = TemporaryFile() - handle.write(txt.encode('UTF-8')) + handle.write('catches mice.\n'.encode('UTF-8')) handle.seek(0) - handles.append(handle) - ctx = SubProcContext('meow', ['cat'], stdin_files=handles) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\neat fish.\n' - assert ctx.ret_code == 0 - for handle in handles: + ctx = SubProcContext('meow', ['cat'], stdin_files=[handle]) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\n') + self.assertEqual(ctx.ret_code, 0) handle.close() - -def test_run_command_with_stdin_from_paths(): - """Test STDIN from multiple file paths""" - handles = [] - for txt in ['catches mice.\n', 'eat fish.\n']: + def test_run_command_with_stdin_from_path(self): + """Test STDIN from a single file path""" handle = NamedTemporaryFile() - handle.write(txt.encode('UTF-8')) + handle.write('catches mice.\n'.encode('UTF-8')) handle.seek(0) - handles.append(handle) - ctx = SubProcContext( - 'meow', ['cat'], stdin_files=[handle.name for handle in handles] - ) - SubProcPool.run_command(ctx) - assert ctx.err == '' - assert ctx.out == 'catches mice.\neat fish.\n' - assert ctx.ret_code == 0 - for handle in handles: + ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name]) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\n') + self.assertEqual(ctx.ret_code, 0) handle.close() - -def test_xfunction(): - """Test xtrigger function import.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - the_answer_file = python_dir / "the_answer.py" - with the_answer_file.open(mode="w") as f: - f.write("""the_answer = lambda: 42""") - f.flush() - f_name = "the_answer" - fn = get_xtrig_func(f_name, f_name, temp_dir) - result = fn() - assert 42 == result - - -def test_xfunction_cache(): - 
"""Test xtrigger function import cache.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - amandita_file = python_dir / "amandita.py" - with amandita_file.open(mode="w") as f: - f.write("""choco = lambda: 'chocolate'""") - f.flush() - m_name = "amandita" # module - f_name = "choco" # function - fn = get_xtrig_func(m_name, f_name, temp_dir) - result = fn() - assert 'chocolate' == result - - # is in the cache - assert (m_name, f_name) in _XTRIG_FUNC_CACHE - # returned from cache - assert fn, get_xtrig_func(m_name, f_name == temp_dir) - - -def test_xfunction_import_error(): - """Test for error on importing a xtrigger function. - - To prevent the test eventually failing if the test function is added - and successfully imported, we use an invalid module name as per Python - spec. - """ - with TemporaryDirectory() as temp_dir: - with pytest.raises(ModuleNotFoundError): + def test_run_command_with_stdin_from_handles(self): + """Test STDIN from multiple file handles""" + handles = [] + for txt in ['catches mice.\n', 'eat fish.\n']: + handle = TemporaryFile() + handle.write(txt.encode('UTF-8')) + handle.seek(0) + handles.append(handle) + ctx = SubProcContext('meow', ['cat'], stdin_files=handles) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\neat fish.\n') + self.assertEqual(ctx.ret_code, 0) + for handle in handles: + handle.close() + + def test_run_command_with_stdin_from_paths(self): + """Test STDIN from multiple file paths""" + handles = [] + for txt in ['catches mice.\n', 'eat fish.\n']: + handle = NamedTemporaryFile() + handle.write(txt.encode('UTF-8')) + handle.seek(0) + handles.append(handle) + ctx = SubProcContext( + 'meow', ['cat'], stdin_files=[handle.name for handle in handles]) + SubProcPool.run_command(ctx) + self.assertEqual(ctx.err, '') + self.assertEqual(ctx.out, 'catches mice.\neat fish.\n') + self.assertEqual(ctx.ret_code, 
0) + for handle in handles: + handle.close() + + def test_xfunction(self): + """Test xtrigger function import.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + the_answer_file = python_dir / "the_answer.py" + with the_answer_file.open(mode="w") as f: + f.write("""the_answer = lambda: 42""") + f.flush() + f_name = "the_answer" + fn = get_xtrig_func(f_name, f_name, temp_dir) + result = fn() + self.assertEqual(42, result) + + def test_xfunction_cache(self): + """Test xtrigger function import cache.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + amandita_file = python_dir / "amandita.py" + with amandita_file.open(mode="w") as f: + f.write("""choco = lambda: 'chocolate'""") + f.flush() + m_name = "amandita" # module + f_name = "choco" # function + fn = get_xtrig_func(m_name, f_name, temp_dir) + result = fn() + self.assertEqual('chocolate', result) + + # is in the cache + self.assertTrue((m_name, f_name) in _XTRIG_FUNC_CACHE) + # returned from cache + self.assertEqual(fn, get_xtrig_func(m_name, f_name, temp_dir)) + + def test_xfunction_import_error(self): + """Test for error on importing a xtrigger function. + + To prevent the test eventually failing if the test function is added + and successfully imported, we use an invalid module name as per Python + spec. 
+ """ + with TemporaryDirectory() as temp_dir, self.assertRaises( + ModuleNotFoundError + ): get_xtrig_func("invalid-module-name", "func-name", temp_dir) - -def test_xfunction_attribute_error(): - """Test for error on looking for an attribute in a xtrigger script.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - the_answer_file = python_dir / "the_sword.py" - with the_answer_file.open(mode="w") as f: - f.write("""the_droid = lambda: 'excalibur'""") - f.flush() - f_name = "the_sword" - with pytest.raises(AttributeError): - get_xtrig_func(f_name, f_name, temp_dir) + def test_xfunction_attribute_error(self): + """Test for error on looking for an attribute in a xtrigger script.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + the_answer_file = python_dir / "the_sword.py" + with the_answer_file.open(mode="w") as f: + f.write("""the_droid = lambda: 'excalibur'""") + f.flush() + f_name = "the_sword" + with self.assertRaises(AttributeError): + get_xtrig_func(f_name, f_name, temp_dir) @pytest.fixture def mock_ctx(): def inner_(ret_code=None, host=None, cmd_key=None, cmd=None): - """Provide a SimpleNamespace which looks like a ctx object.""" + """Provide a SimpleNamespace which looks like a ctx object. 
+ """ inputs = locals() defaults = { - 'ret_code': 255, - 'host': 'mouse', - 'cmd_key': 'my-command', - 'cmd': ['bistromathic', 'take-off'], + 'ret_code': 255, 'host': 'mouse', 'cmd_key': 'my-command', + 'cmd': ['bistromathic', 'take-off'] } for key in inputs: if inputs[key] is None: @@ -250,10 +235,9 @@ def inner_(ret_code=None, host=None, cmd_key=None, cmd=None): timestamp=None, ret_code=inputs['ret_code'], host=inputs['host'], - cmd_key=inputs['cmd_key'], + cmd_key=inputs['cmd_key'] ) return ctx - yield inner_ @@ -276,19 +260,21 @@ def _test_callback_255(ctx, foo=''): 'platform: None - Could not connect to mouse.', 255, 'ssh', - id="return 255", + id="return 255" ), pytest.param( 'platform: localhost - Could not connect to mouse.', 255, TaskJobLogsRetrieveContext(['ssh', 'something'], None, None), - id="return 255 (log-ret)", - ), - ], + id="return 255 (log-ret)" + ) + ] ) -def test__run_command_exit(caplog, mock_ctx, expect, ret_code, cmd_key +def test__run_command_exit( + caplog, mock_ctx, expect, ret_code, cmd_key ): - """It runs a callback""" + """It runs a callback + """ ctx = mock_ctx(ret_code=ret_code, cmd_key=cmd_key, cmd=['ssh']) SubProcPool._run_command_exit( ctx, callback=_test_callback, callback_255=_test_callback_255 @@ -307,7 +293,9 @@ def test__run_command_exit_no_255_callback(caplog, mock_ctx): def test__run_command_exit_no_gettable_platform(caplog, mock_ctx): """It logs being unable to select a platform""" ret_ctx = TaskJobLogsRetrieveContext( - platform_name='rhenas', max_size=256, key='rhenas' + platform_name='rhenas', + max_size=256, + key='rhenas' ) ctx = mock_ctx(cmd_key=ret_ctx, cmd=['ssh'], ret_code=255) SubProcPool._run_command_exit(ctx, callback=_test_callback) @@ -322,19 +310,20 @@ def test__run_command_exit_no_255_args(caplog, mock_ctx): mock_ctx(cmd=['ssh', 'Zaphod']), callback=_test_callback, callback_args=['Zaphod'], - callback_255=_test_callback_255, + callback_255=_test_callback_255 ) assert '255' in caplog.records[1].msg def 
test__run_command_exit_add_to_badhosts(mock_ctx): - """It updates the list of badhosts""" + """It updates the list of badhosts + """ badhosts = {'foo', 'bar'} SubProcPool._run_command_exit( mock_ctx(cmd=['ssh']), bad_hosts=badhosts, callback=print, - callback_args=['Welcome to Magrathea'], + callback_args=['Welcome to Magrathea'] ) assert badhosts == {'foo', 'bar', 'mouse'} @@ -346,36 +335,32 @@ def test__run_command_exit_add_to_badhosts_log(caplog, mock_ctx): mock_ctx(cmd=['ssh']), bad_hosts=badhosts, callback=lambda x, t: print(str(x)), - callback_args=[ - TaskProxy( - Tokens('~u/w//c/t/2'), - SimpleNamespace( - name='t', - dependencies={}, - sequential='', - external_triggers=[], - xtrig_labels={}, - expiration_offset=None, - outputs={ - TASK_OUTPUT_SUBMITTED: [None, None], - TASK_OUTPUT_SUBMIT_FAILED: [None, None], - TASK_OUTPUT_SUCCEEDED: [None, None], - TASK_OUTPUT_FAILED: [None, None], - TASK_OUTPUT_EXPIRED: [None, None], - }, - graph_children={}, - rtconfig={'platform': 'foo'}, - ), - ISO8601Point('1990'), - ) - ], + callback_args=[TaskProxy( + Tokens('~u/w//c/t/2'), + SimpleNamespace( + name='t', dependencies={}, sequential='', + external_triggers=[], xtrig_labels={}, + expiration_offset=None, + outputs={ + TASK_OUTPUT_SUBMITTED: [None, None], + TASK_OUTPUT_SUBMIT_FAILED: [None, None], + TASK_OUTPUT_SUCCEEDED: [None, None], + TASK_OUTPUT_FAILED: [None, None], + TASK_OUTPUT_EXPIRED: [None, None], + }, + graph_children={}, rtconfig={'platform': 'foo'} + + ), + ISO8601Point('1990') + )] ) assert 'platform: foo' in caplog.records[0].message assert badhosts == {'foo', 'bar', 'mouse'} def test__run_command_exit_rsync_fails(mock_ctx): - """It updates the list of badhosts""" + """It updates the list of badhosts + """ badhosts = {'foo', 'bar'} ctx = mock_ctx(cmd=['rsync'], ret_code=42, cmd_key='file-install') SubProcPool._run_command_exit( @@ -386,10 +371,10 @@ def test__run_command_exit_rsync_fails(mock_ctx): { 'name': 'Magrathea', 'ssh command': 'ssh', - 'rsync 
command': 'rsync command', + 'rsync command': 'rsync command' }, 'Welcome to Magrathea', - ], + ] ) assert badhosts == {'foo', 'bar', 'mouse'} @@ -399,11 +384,12 @@ def test__run_command_exit_rsync_fails(mock_ctx): [ (True, {'cmd': ['ssh'], 'ret_code': 255}), (False, {'cmd': ['foo'], 'ret_code': 255}), - (False, {'cmd': ['ssh'], 'ret_code': 42}), - ], + (False, {'cmd': ['ssh'], 'ret_code': 42}) + ] ) def test_ssh_255_fail(mock_ctx, expect, ctx_kwargs): - """It knows when a ctx has failed""" + """It knows when a ctx has failed + """ output = SubProcPool.ssh_255_fail(mock_ctx(**ctx_kwargs)) assert output == expect @@ -415,10 +401,11 @@ def test_ssh_255_fail(mock_ctx, expect, ctx_kwargs): (True, {'cmd': ['rsync'], 'ret_code': 255, 'host': 'not_local'}), (False, {'cmd': ['make it-so'], 'ret_code': 255, 'host': 'not_local'}), (False, {'cmd': ['rsync'], 'ret_code': 125, 'host': 'localhost'}), - ], + ] ) def test_rsync_255_fail(mock_ctx, expect, ctx_kwargs): - """It knows when a ctx has failed""" + """It knows when a ctx has failed + """ output = SubProcPool.rsync_255_fail( mock_ctx(**ctx_kwargs), {'ssh command': 'ssh', 'rsync command': 'rsync command'}, From 339af9cc96e01a51570720e30bf51648cfa2798a Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Thu, 13 Mar 2025 09:44:47 +0000 Subject: [PATCH 020/101] tests: convert unittest to pytest --- tests/unit/test_subprocpool.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/unit/test_subprocpool.py b/tests/unit/test_subprocpool.py index 6b686c27082..7f3d412abe6 100644 --- a/tests/unit/test_subprocpool.py +++ b/tests/unit/test_subprocpool.py @@ -75,14 +75,14 @@ def test_run_command_writes_to_out(self): self.assertEqual(ctx.out, 'pirate urrrr\n') self.assertEqual(ctx.ret_code, 0) - def test_run_command_writes_to_err(self): - """Test basic usage, command writes to STDERR""" - ctx = SubProcContext( - 'parrot2', ['bash', '-c', 'echo pirate errrr >&2']) - SubProcPool.run_command(ctx) - 
self.assertEqual(ctx.err, 'pirate errrr\n') - self.assertEqual(ctx.out, '') - self.assertEqual(ctx.ret_code, 0) + +def test_run_command_writes_to_err(): + """Test basic usage, command writes to STDERR""" + ctx = SubProcContext('parrot2', ['bash', '-c', 'echo pirate errrr >&2']) + SubProcPool.run_command(ctx) + assert ctx.err == 'pirate errrr\n' + assert ctx.out == '' + assert ctx.ret_code == 0 def test_run_command_with_stdin_from_str(self): """Test STDIN from string""" From 391559b30a1cb0849462d27b95874aa38712018f Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Thu, 13 Mar 2025 09:53:36 +0000 Subject: [PATCH 021/101] tests/u: test_subprocpool.py::test_run_command_writes_to_err * Attempt to fix flaky test. * Cut out shell profile files to omit some spurious stderr. --- tests/unit/test_subprocpool.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/unit/test_subprocpool.py b/tests/unit/test_subprocpool.py index 7f3d412abe6..3ffe50ef366 100644 --- a/tests/unit/test_subprocpool.py +++ b/tests/unit/test_subprocpool.py @@ -14,15 +14,17 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
+from pathlib import Path +from types import SimpleNamespace from tempfile import ( - NamedTemporaryFile, SpooledTemporaryFile, TemporaryFile, - TemporaryDirectory + NamedTemporaryFile, + SpooledTemporaryFile, + TemporaryFile, + TemporaryDirectory, ) -import unittest -import pytest -from pathlib import Path -from types import SimpleNamespace + +import pytest from cylc.flow import LOG from cylc.flow.id import Tokens @@ -78,9 +80,12 @@ def test_run_command_writes_to_out(self): def test_run_command_writes_to_err(): """Test basic usage, command writes to STDERR""" - ctx = SubProcContext('parrot2', ['bash', '-c', 'echo pirate errrr >&2']) + ctx = SubProcContext( + 'parrot2', + ['bash', '--noprofile', '--norc', '-c', 'echo pirate errrr >&2'] + ) SubProcPool.run_command(ctx) - assert ctx.err == 'pirate errrr\n' + assert 'pirate errrr\n' assert ctx.out == '' assert ctx.ret_code == 0 From b9c080cd4b2ab829797238e34c9de9254389116e Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 28 Mar 2025 11:19:46 +0000 Subject: [PATCH 022/101] Adding profiler unit tests --- cylc/flow/scripts/profiler.py | 144 +++++++++++++--------------- tests/unit/scripts/test_profiler.py | 70 ++++++++++++++ 2 files changed, 135 insertions(+), 79 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 060d4f67e39..fede9b5d884 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -16,7 +16,7 @@ # along with this program. If not, see . """cylc profiler [OPTIONS] -Profiler which periodically polls PBS cgroups to track +Profiler which periodically polls cgroups to track the resource usage of jobs running on the node. 
""" @@ -53,14 +53,14 @@ def get_option_parser() -> COP: @cli_function(get_option_parser) -def main(parser, options): +def main(parser: COP, options) -> None: """CLI main.""" # Register the stop_profiler function with the signal library signal.signal(signal.SIGINT, stop_profiler) signal.signal(signal.SIGHUP, stop_profiler) signal.signal(signal.SIGTERM, stop_profiler) - profile(options) + get_config(options) @dataclass @@ -77,40 +77,43 @@ def stop_profiler(*args): sys.exit(0) -def parse_memory_file(process): +def parse_memory_file(cgroup_memory_path): """Open the memory stat file and copy the appropriate data""" - with open(process.cgroup_memory_path, 'r') as f: + with open(cgroup_memory_path, 'r') as f: for line in f: return int(line) // 1024 -def parse_cpu_file(process, cgroup_version): +def parse_cpu_file(cgroup_cpu_path, cgroup_version): """Open the memory stat file and return the appropriate data""" if cgroup_version == 1: - with open(process.cgroup_cpu_path, 'r') as f: + with open(cgroup_cpu_path, 'r') as f: for line in f: if "usage_usec" in line: return int(RE_INT.findall(line)[0]) // 1000 elif cgroup_version == 2: - with open(process.cgroup_cpu_path, 'r') as f: + with open(cgroup_cpu_path, 'r') as f: for line in f: # Cgroups v2 uses nanoseconds return int(line) / 1000000 - else: - raise FileNotFoundError("cpu usage files not found") def write_data(data, filename): - try: - with open(filename, 'w') as f: - f.write(data + "\n") - except IOError as err: - raise IOError("Unable to write data to file:" + filename) from err + with open(filename, 'w') as f: + f.write(data + "\n") + +def get_cgroup_version(cgroup_location: Path, cgroup_name: Path) -> int: + # HPC uses cgroups v2 and SPICE uses cgroups v1 + if Path.exists(Path(cgroup_location + cgroup_name)): + return 1 + elif Path.exists(Path(cgroup_location + "/memory" + cgroup_name)): + return 2 -def get_cgroup_dir(): + +def get_cgroup_name(): """Get the cgroup directory for the current process""" # Get the PID of 
the current process pid = os.getpid() @@ -121,79 +124,62 @@ def get_cgroup_dir(): result = PID_REGEX.search(result).group() return result except FileNotFoundError as err: - print(err) - print('/proc/' + str(pid) + '/cgroup not found') - exit() + raise FileNotFoundError( + '/proc/' + str(pid) + '/cgroup not found') from err + except AttributeError as err: - print(err) - print("No cgroup found for process") - exit() + raise AttributeError("No cgroup found for process:", pid) from err -def profile(args): - # Find the cgroup that this process is running in. - # Cylc will put this profiler in the same cgroup - # as the job it is profiling - cgroup_name = get_cgroup_dir() +def get_cgroup_paths(version, location, name): - # HPC uses cgroups v2 and SPICE uses cgroups v1 - cgroup_version = None + if version == 1: + return Process( + cgroup_memory_path=location + + name + "/" + "memory.peak", + cgroup_cpu_path=location + + name + "/" + "cpu.stat") + + elif version == 2: + return Process( + cgroup_memory_path=location + "/memory" + + name + "/memory.max_usage_in_bytes", + cgroup_cpu_path=location + "/cpu" + + name + "/cpuacct.usage") - if Path.exists(Path(args.cgroup_location + cgroup_name)): - cgroup_version = 1 - elif Path.exists(Path(args.cgroup_location + "/memory" + cgroup_name)): - cgroup_version = 2 - else: - raise FileNotFoundError("cgroups not found:" + cgroup_name) +def profile(process, version, delay, keep_looping=lambda: True): + # The infinite loop that will constantly poll the cgroup + # The lambda function is used to allow the loop to be stopped in unit tests peak_memory = 0 - processes = [] + while keep_looping(): + # Write cpu / memory usage data to disk + cpu_time = parse_cpu_file(process.cgroup_cpu_path, version) + write_data(str(cpu_time), "cpu_time") - if cgroup_version == 1: - try: - processes.append(Process( - cgroup_memory_path=args.cgroup_location + - cgroup_name + "/" + "memory.peak", - cgroup_cpu_path=args.cgroup_location + - cgroup_name + "/" + 
"cpu.stat")) - except FileNotFoundError as err: - print(err) - raise FileNotFoundError("cgroups not found:" - + args.cgroup_location) from err - elif cgroup_version == 2: - try: - processes.append(Process( - cgroup_memory_path=args.cgroup_location + "/memory" + - cgroup_name + "/memory.max_usage_in_bytes", - cgroup_cpu_path=args.cgroup_location + "/cpu" + - cgroup_name + "/cpuacct.usage")) - except FileNotFoundError as err: - print(err) - raise FileNotFoundError("cgroups not found:" + - args.cgroup_location) from err - - while True: - failures = 0 - # Write memory usage data - for process in processes: - # Only save Max RSS to disk if it is above the previous value - try: - memory = parse_memory_file(process) - if memory > peak_memory: - peak_memory = memory - write_data(str(peak_memory), "max_rss") - cpu_time = parse_cpu_file(process, cgroup_version) - write_data(str(cpu_time), "cpu_time") - - except (OSError, ValueError) as error: - failures += 1 - if failures > 5: - raise OSError("cgroup polling failure", error) from error - - time.sleep(args.delay) + memory = parse_memory_file(process.cgroup_memory_path) + # Only save Max RSS to disk if it is above the previous value + if memory > peak_memory: + peak_memory = memory + write_data(str(peak_memory), "max_rss") + + time.sleep(delay) + + +def get_config(args): + # Find the cgroup that this process is running in. 
+ # Cylc will put this profiler in the same cgroup + # as the job it is profiling + cgroup_name = get_cgroup_name() + cgroup_version = get_cgroup_version(args.cgroup_location, cgroup_name) + process = get_cgroup_paths(cgroup_version, + args.cgroups_location, + cgroup_name) + + profile(process, cgroup_version, args.delay) if __name__ == "__main__": arg_parser = get_option_parser() - profile(arg_parser.parse_args([])) + get_config(arg_parser.parse_args([])) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 4aa49a22046..b79c5ae8a1e 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -208,3 +208,73 @@ def test_stop_profiler(): stop_profiler() assert pytest_wrapped_e.type == SystemExit assert pytest_wrapped_e.value.code == 0 + + +def test_get_cgroup_version(mocker): + + # Mock the Path.exists function call to return True + mocker.patch("pathlib.Path.exists", return_value=True) + assert get_cgroup_version('stuff/in/place', + 'more_stuff') == 1 + + with mock.patch('pathlib.Path.exists', side_effect=[False, True]): + assert get_cgroup_version('stuff/in/place', + 'more_stuff') == 2 + + # Mock the Path.exists function call to return False + mocker.patch("pathlib.Path.exists", return_value=False) + assert get_cgroup_version('stuff/in/other/place', + 'things') is None + + +def test_get_cgroup_paths(): + + process = get_cgroup_paths(1, "test_location/", + "test_name") + assert process.cgroup_memory_path == "test_location/test_name/memory.peak" + assert process.cgroup_cpu_path == "test_location/test_name/cpu.stat" + + process = get_cgroup_paths(2, "test_location", + "/test_name") + assert (process.cgroup_memory_path == + "test_location/memory/test_name/memory.max_usage_in_bytes") + assert (process.cgroup_cpu_path == + "test_location/cpu/test_name/cpuacct.usage") + + +def test_profile_cpu(mocker): + process = get_cgroup_paths(1, "test_location/", + "test_name") + + mock_file = 
mocker.mock_open(read_data="") + mocker.patch("builtins.open", mock_file) + mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", + return_value=0) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", + return_value=2048) + run_once = mock.Mock(side_effect=[True, False]) + profile(process, 1, 1, run_once) + mock_file.assert_called_with("cpu_time", "w") + + +def test_profile_max_rss(mocker): + process = get_cgroup_paths(1, + "test_location/", + "test_name") + + mock_file = mocker.mock_open(read_data="") + mocker.patch("builtins.open", mock_file) + mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", + return_value=1024) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", + return_value=2048) + run_once = mock.Mock(side_effect=[True, False]) + profile(process, 1, 1, run_once) + mock_file.assert_called_with("max_rss", "w") + + +def test_stop_profiler(): + with pytest.raises(SystemExit) as pytest_wrapped_e: + stop_profiler() + assert pytest_wrapped_e.type == SystemExit + assert pytest_wrapped_e.value.code == 0 From 7091711ef5f87fcae79bcdbe7b6302d325d6af1a Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 1 Apr 2025 15:41:54 +0100 Subject: [PATCH 023/101] adding pycharm files to .gitignore file --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index b18c09ad1eb..f7e26153db0 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,9 @@ __pycache__/ # vscode .vscode +# pycharm +.idea + # processed workflow configs *.rc.processed *.cylc.processed From ad6b3a1334a1f339b15b56bf7bfe186bbb5777d4 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 2 Apr 2025 10:07:26 +0100 Subject: [PATCH 024/101] Fixing my terrible rebasing --- .github/workflows/1_create_release_pr.yml | 13 + .github/workflows/2_auto_publish_release.yml | 11 + .github/workflows/bash.yml | 8 + .github/workflows/build.yml | 34 +- .github/workflows/test_conda-build.yml | 7 + .github/workflows/test_fast.yml | 7 +- 
.github/workflows/test_functional.yml | 4 + .github/workflows/test_tutorial_workflow.yml | 8 + cylc/flow/cfgspec/globalcfg.py | 46 +- cylc/flow/host_select.py | 11 +- .../43-auto-restart-force-override-normal.t | 5 +- tests/unit/scripts/test_profiler.py | 123 +----- tests/unit/test_job_file.py | 10 +- tests/unit/test_subprocpool.py | 406 +++++++++--------- 14 files changed, 347 insertions(+), 346 deletions(-) diff --git a/.github/workflows/1_create_release_pr.yml b/.github/workflows/1_create_release_pr.yml index 10a700b7ab4..db8374e0b54 100644 --- a/.github/workflows/1_create_release_pr.yml +++ b/.github/workflows/1_create_release_pr.yml @@ -11,6 +11,19 @@ on: required: false default: 'master' +concurrency: + # Only let this run 1 at a time + group: ${{ github.workflow }} + cancel-in-progress: false + +defaults: + run: + shell: bash + +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off + jobs: create-release-pr: runs-on: ubuntu-latest diff --git a/.github/workflows/2_auto_publish_release.yml b/.github/workflows/2_auto_publish_release.yml index 943c54090d9..a2b84a46c4d 100644 --- a/.github/workflows/2_auto_publish_release.yml +++ b/.github/workflows/2_auto_publish_release.yml @@ -7,7 +7,18 @@ on: # NOTE: While this is too generic, we use the `if` condition of the job to narrow it down # NOTE: Don't use `branches` as we might create release on any branch +concurrency: + # Only let this run 1 at a time + group: ${{ github.workflow }} + cancel-in-progress: false + +defaults: + run: + shell: bash + env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off # Best not to include the GH token here, only do it for the steps that need it MERGE_SHA: ${{ github.event.pull_request.merge_commit_sha }} CHANGELOG_FILE: CHANGES.md diff --git a/.github/workflows/bash.yml b/.github/workflows/bash.yml index b7c97cd21e9..d0970de9778 100644 --- a/.github/workflows/bash.yml +++ b/.github/workflows/bash.yml @@ -31,6 +31,14 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: 
true +defaults: + run: + shell: bash + +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off + jobs: bash-docker: runs-on: ubuntu-latest diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2f09c9cbf39..6d12e5b1145 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,6 +10,20 @@ on: - 'MANIFEST.in' # check packaging - 'pyproject.toml' # check build config - 'setup.cfg' # check deps and project config + - '.gitignore' + - '.github/workflows/build.yml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +defaults: + run: + shell: bash -leo pipefail {0} + +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off jobs: test: @@ -18,21 +32,23 @@ jobs: strategy: fail-fast: false matrix: - os: ['ubuntu-latest'] - python: ['3.8', '3.9', '3.10', '3.11'] - include: - - os: 'ubuntu-22.04' - python: '3.7' + os: ['ubuntu-latest', 'macos-latest'] + python: ['3.7', '3.8', '3.9', '3.10', '3'] + exclude: - os: 'macos-latest' - python: '3.8' + python: '3.7' steps: - name: Checkout uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v5 + uses: mamba-org/setup-micromamba@v2 with: - python-version: ${{ matrix.python }} + cache-environment: true + post-cleanup: 'all' + environment-name: cylc-build + create-args: >- + python=${{ matrix.python }} - name: Build uses: cylc/release-actions/build-python-package@v1 @@ -40,7 +56,7 @@ jobs: - name: Inspect run: | unzip -l dist/*.whl | tee files - grep 'cylc_flow.*.dist-info/COPYING' files + grep -E 'cylc_flow.*.dist-info/.*COPYING' files grep 'cylc/flow/py.typed' files grep 'cylc/flow/etc' files grep 'cylc/flow/etc/cylc-completion.bash' files diff --git a/.github/workflows/test_conda-build.yml b/.github/workflows/test_conda-build.yml index b4e97117b1a..619981f2ac4 100644 --- a/.github/workflows/test_conda-build.yml +++ b/.github/workflows/test_conda-build.yml @@ -13,6 +13,13 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} 
cancel-in-progress: true +defaults: + run: + shell: bash + +env: + FORCE_COLOR: 2 + jobs: test_conda_install: if: github.repository_owner == 'cylc' || github.event_name != 'schedule' diff --git a/.github/workflows/test_fast.yml b/.github/workflows/test_fast.yml index 9847b71a454..95e8a5754c9 100644 --- a/.github/workflows/test_fast.yml +++ b/.github/workflows/test_fast.yml @@ -16,6 +16,9 @@ defaults: run: shell: bash -c "exec $CONDA_PREFIX/bin/bash -elo pipefail {0}" +env: + PIP_PROGRESS_BAR: off + jobs: test: runs-on: ${{ matrix.os }} @@ -32,11 +35,9 @@ jobs: - os: 'ubuntu-latest' python-version: '3.9' # not the oldest, not the most recent version time-zone: 'XXX-09:35' - env: TZ: ${{ matrix.time-zone }} PYTEST_ADDOPTS: --cov --cov-append -n 5 --color=yes - steps: - name: Checkout uses: actions/checkout@v4 @@ -106,6 +107,8 @@ jobs: strategy: matrix: python-version: ['3'] + env: + FORCE_COLOR: 2 steps: - name: Checkout uses: actions/checkout@v4 diff --git a/.github/workflows/test_functional.yml b/.github/workflows/test_functional.yml index aa7ebb8de60..d85d3a60c9b 100644 --- a/.github/workflows/test_functional.yml +++ b/.github/workflows/test_functional.yml @@ -36,6 +36,10 @@ defaults: run: shell: bash -c "exec $CONDA_PREFIX/bin/bash -elo pipefail {0}" +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off + jobs: test: runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_tutorial_workflow.yml b/.github/workflows/test_tutorial_workflow.yml index 3faa8469ef4..01808ca4871 100644 --- a/.github/workflows/test_tutorial_workflow.yml +++ b/.github/workflows/test_tutorial_workflow.yml @@ -17,6 +17,14 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +defaults: + run: + shell: bash + +env: + FORCE_COLOR: 2 + PIP_PROGRESS_BAR: off + jobs: test: strategy: diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 6d6dc23f717..821b3fd3d1f 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ 
b/cylc/flow/cfgspec/globalcfg.py @@ -826,16 +826,52 @@ def default_for( range. ''') Conf('condemned', VDR.V_ABSOLUTE_HOST_LIST, desc=f''' - These hosts will not be used to run jobs. + List run hosts that workflows should *not* run on. - If workflows are already running on - condemned hosts, Cylc will shut them down and - restart them on different hosts. + These hosts will be subtracted from the + `available ` hosts + preventing new workflows from starting on the "condemned" host. + + Any workflows running on these hosts will either migrate + to another host, or shut down according to + :py:mod:`the configuration `. + + This feature requires ``auto restart`` to be listed + in `global.cylc[scheduler][main loop]plugins`. + + For more information, see the + :py:mod:`auto restart ` + plugin. + + .. rubric:: Example: + + .. code-block:: cylc + + [scheduler] + [[main loop]] + # activate the "auto restart" plugin + plugins = auto restart + [[run hosts]] + # there are three hosts in the "pool" + available = host1, host2, host3 + + # however two have been taken out: + # * workflows running on "host1" will attempt to + # restart on "host3" + # * workflows running on "host2" will shutdown + condemned = host1, host2! .. seealso:: + :py:mod:`cylc.flow.main_loop.auto_restart` :ref:`auto-stop-restart` + .. versionchanged:: 8.4.2 + + The "force mode" (activated by a "!" suffix) caused issues + at workflow startup for Cylc versions between 8.0.0 and + 8.4.1 inclusive. + .. versionchanged:: 8.0.0 {REPLACES}``[suite servers]condemned hosts``. @@ -1345,7 +1381,7 @@ def default_for( The means by which task progress messages are reported back to the running workflow. - ..rubric:: Options: + .. 
rubric:: Options: zmq Direct client-server TCP communication via network ports diff --git a/cylc/flow/host_select.py b/cylc/flow/host_select.py index 69e32c68a71..cf940864b90 100644 --- a/cylc/flow/host_select.py +++ b/cylc/flow/host_select.py @@ -128,6 +128,13 @@ def select_workflow_host(cached=True): # be returned with the up-to-date configuration. global_config = glbl_cfg(cached=cached) + # condemned hosts may be suffixed with an "!" to activate "force mode" + blacklist = [] + for host in global_config.get(['scheduler', 'run hosts', 'condemned'], []): + if host.endswith('!'): + host = host[:-1] + blacklist.append(host) + return select_host( # list of workflow hosts global_config.get([ @@ -138,9 +145,7 @@ def select_workflow_host(cached=True): 'scheduler', 'run hosts', 'ranking' ]), # list of condemned hosts - blacklist=global_config.get( - ['scheduler', 'run hosts', 'condemned'] - ), + blacklist=blacklist, blacklist_name='condemned host' ) diff --git a/tests/functional/restart/43-auto-restart-force-override-normal.t b/tests/functional/restart/43-auto-restart-force-override-normal.t index b61d08c68cb..35edc57d1f9 100644 --- a/tests/functional/restart/43-auto-restart-force-override-normal.t +++ b/tests/functional/restart/43-auto-restart-force-override-normal.t @@ -50,7 +50,10 @@ create_test_global_config '' " ${BASE_GLOBAL_CONFIG} [scheduler] [[run hosts]] - available = ${CYLC_TEST_HOST_1} + available = ${CYLC_TEST_HOST_1}, ${CYLC_TEST_HOST_2} + # ensure the workflow can start if a host is condemned + # in force mode see #6623 + condemned = ${CYLC_TEST_HOST_2}! 
" set_test_number 8 diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index b79c5ae8a1e..54398fbee7c 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -51,7 +51,8 @@ def test_parse_cpu_file(mocker): mock_file = mocker.mock_open(read_data="usage_usec 1000000") mocker.patch("builtins.open", mock_file) - assert parse_cpu_file("mocked_file.txt", 1) == 1000 + assert parse_cpu_file( + "mocked_file.txt", 1) == 1000 mock_file.assert_called_once_with("mocked_file.txt", "r") mock_file = mocker.mock_open(read_data="1000000") @@ -137,72 +138,6 @@ def test_profile_cpu(mocker): mock_file.assert_called_with("cpu_time", "w") -def test_profile_max_rss(mocker): - process = get_cgroup_paths(1, - "test_location/", - "test_name") - - mock_file = mocker.mock_open(read_data="") - mocker.patch("builtins.open", mock_file) - mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", - return_value=1024) - mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", - return_value=2048) - run_once = mock.Mock(side_effect=[True, False]) - profile(process, 1, 1, run_once) - mock_file.assert_called_with("max_rss", "w") - - -def test_get_cgroup_version(mocker): - - # Mock the Path.exists function call to return True - mocker.patch("pathlib.Path.exists", return_value=True) - assert get_cgroup_version('stuff/in/place', 'more_stuff') == 1 - - with mock.patch('pathlib.Path.exists', side_effect=[False, True]): - assert get_cgroup_version('stuff/in/place', 'more_stuff') == 2 - - # Mock the Path.exists function call to return False - mocker.patch("pathlib.Path.exists", return_value=False) - assert get_cgroup_version('stuff/in/other/place', 'things') is None - - -def test_get_cgroup_paths(): - - process = get_cgroup_paths(1, "test_location/", "test_name") - assert process.cgroup_memory_path == "test_location/test_name/memory.peak" - assert process.cgroup_cpu_path == "test_location/test_name/cpu.stat" - - process = 
get_cgroup_paths(2, "test_location", "/test_name") - assert (process.cgroup_memory_path == - "test_location/memory/test_name/memory.max_usage_in_bytes") - assert process.cgroup_cpu_path == "test_location/cpu/test_name/cpuacct.usage" - - -def test_profile_cpu(mocker): - process = get_cgroup_paths(1, "test_location/", "test_name") - - mock_file = mocker.mock_open(read_data="") - mocker.patch("builtins.open", mock_file) - mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", return_value=0) - mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) - run_once = mock.Mock(side_effect=[True, False]) - profile(process, 1, 1, run_once) - mock_file.assert_called_with("cpu_time", "w") - - -def test_profile_max_rss(mocker): - process = get_cgroup_paths(1, "test_location/", "test_name") - - mock_file = mocker.mock_open(read_data="") - mocker.patch("builtins.open", mock_file) - mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", return_value=1024) - mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) - run_once = mock.Mock(side_effect=[True, False]) - profile(process, 1, 1, run_once) - mock_file.assert_called_with("max_rss", "w") - - def test_stop_profiler(): with pytest.raises(SystemExit) as pytest_wrapped_e: stop_profiler() @@ -210,53 +145,6 @@ def test_stop_profiler(): assert pytest_wrapped_e.value.code == 0 -def test_get_cgroup_version(mocker): - - # Mock the Path.exists function call to return True - mocker.patch("pathlib.Path.exists", return_value=True) - assert get_cgroup_version('stuff/in/place', - 'more_stuff') == 1 - - with mock.patch('pathlib.Path.exists', side_effect=[False, True]): - assert get_cgroup_version('stuff/in/place', - 'more_stuff') == 2 - - # Mock the Path.exists function call to return False - mocker.patch("pathlib.Path.exists", return_value=False) - assert get_cgroup_version('stuff/in/other/place', - 'things') is None - - -def test_get_cgroup_paths(): - - process = get_cgroup_paths(1, 
"test_location/", - "test_name") - assert process.cgroup_memory_path == "test_location/test_name/memory.peak" - assert process.cgroup_cpu_path == "test_location/test_name/cpu.stat" - - process = get_cgroup_paths(2, "test_location", - "/test_name") - assert (process.cgroup_memory_path == - "test_location/memory/test_name/memory.max_usage_in_bytes") - assert (process.cgroup_cpu_path == - "test_location/cpu/test_name/cpuacct.usage") - - -def test_profile_cpu(mocker): - process = get_cgroup_paths(1, "test_location/", - "test_name") - - mock_file = mocker.mock_open(read_data="") - mocker.patch("builtins.open", mock_file) - mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", - return_value=0) - mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", - return_value=2048) - run_once = mock.Mock(side_effect=[True, False]) - profile(process, 1, 1, run_once) - mock_file.assert_called_with("cpu_time", "w") - - def test_profile_max_rss(mocker): process = get_cgroup_paths(1, "test_location/", @@ -271,10 +159,3 @@ def test_profile_max_rss(mocker): run_once = mock.Mock(side_effect=[True, False]) profile(process, 1, 1, run_once) mock_file.assert_called_with("max_rss", "w") - - -def test_stop_profiler(): - with pytest.raises(SystemExit) as pytest_wrapped_e: - stop_profiler() - assert pytest_wrapped_e.type == SystemExit - assert pytest_wrapped_e.value.code == 0 diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index 88a177fa0ac..617d02fc1e1 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -65,13 +65,11 @@ def fixture_get_platform(): Returns: platforms dictionary. 
""" - def inner_func(custom_settings=None): platform = platform_from_name() if custom_settings is not None: platform.update(custom_settings) return platform - yield inner_func @@ -161,9 +159,9 @@ def test_write(fixture_get_platform): "execution_time_limit": 60 }, ('\n\n# DIRECTIVES:\n# @ job_name = farm_noises.baa.1' - '\n# @ output = directory/job.out\n# @ error = directory/' - 'job.err\n# @ wall_clock_limit = 120,60\n# @ moo = foo' - '\n# @ cluck = bar\n# @ queue') + '\n# @ output = directory/job.out\n# @ error = directory/' + 'job.err\n# @ wall_clock_limit = 120,60\n# @ moo = foo' + '\n# @ cluck = bar\n# @ queue') ), ( # Check no directives is correctly written @@ -219,6 +217,7 @@ def test_write(fixture_get_platform): "job_d": "1/test_task_id/01", "job_file_path": "$HOME/directory/job", "execution_time_limit": 1000 + }, ('\n\n# DIRECTIVES:\n#$ -N farm_noises.baa.1\n#$ -o directory/' 'job.out\n#$ -e directory/job.err\n#$ -l h_rt=0:16:40\n#$ -V\n#' @@ -384,6 +383,7 @@ def test_no_script_section_with_comment_only_script(): } with io.StringIO() as fake_file: + JobFileWriter()._write_script(fake_file, job_conf) blah = fake_file.getvalue() print(blah) diff --git a/tests/unit/test_subprocpool.py b/tests/unit/test_subprocpool.py index 3ffe50ef366..b4fe684a5e9 100644 --- a/tests/unit/test_subprocpool.py +++ b/tests/unit/test_subprocpool.py @@ -23,7 +23,6 @@ TemporaryDirectory, ) - import pytest from cylc.flow import LOG @@ -46,36 +45,36 @@ from cylc.flow.task_proxy import TaskProxy -class TestSubProcPool(unittest.TestCase): +def test_get_temporary_file(): + """Test SubProcPool.get_temporary_file.""" + assert isinstance(SubProcPool.get_temporary_file(), SpooledTemporaryFile) + + +def test_run_command_returns_0(): + """Test basic usage, command returns 0""" + ctx = SubProcContext('truth', ['true']) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == '' + assert ctx.ret_code == 0 - def test_get_temporary_file(self): - """Test 
SubProcPool.get_temporary_file.""" - self.assertIsInstance( - SubProcPool.get_temporary_file(), SpooledTemporaryFile) - def test_run_command_returns_0(self): - """Test basic usage, command returns 0""" - ctx = SubProcContext('truth', ['true']) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, '') - self.assertEqual(ctx.ret_code, 0) +def test_run_command_returns_1(): + """Test basic usage, command returns 1""" + ctx = SubProcContext('lies', ['false']) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == '' + assert ctx.ret_code == 1 - def test_run_command_returns_1(self): - """Test basic usage, command returns 1""" - ctx = SubProcContext('lies', ['false']) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, '') - self.assertEqual(ctx.ret_code, 1) - def test_run_command_writes_to_out(self): - """Test basic usage, command writes to STDOUT""" - ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr']) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'pirate urrrr\n') - self.assertEqual(ctx.ret_code, 0) +def test_run_command_writes_to_out(): + """Test basic usage, command writes to STDOUT""" + ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr']) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'pirate urrrr\n' + assert ctx.ret_code == 0 def test_run_command_writes_to_err(): @@ -89,148 +88,158 @@ def test_run_command_writes_to_err(): assert ctx.out == '' assert ctx.ret_code == 0 - def test_run_command_with_stdin_from_str(self): - """Test STDIN from string""" - ctx = SubProcContext('meow', ['cat'], stdin_str='catches mice.\n') - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\n') - self.assertEqual(ctx.ret_code, 0) - - def test_run_command_with_stdin_from_unicode(self): - """Test STDIN from string with Unicode""" - ctx = SubProcContext('meow', 
['cat'], stdin_str='喵\n') - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, '喵\n') - self.assertEqual(ctx.ret_code, 0) - - def test_run_command_with_stdin_from_handle(self): - """Test STDIN from a single opened file handle""" + +def test_run_command_with_stdin_from_str(): + """Test STDIN from string""" + ctx = SubProcContext('meow', ['cat'], stdin_str='catches mice.\n') + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\n' + assert ctx.ret_code == 0 + + +def test_run_command_with_stdin_from_unicode(): + """Test STDIN from string with Unicode""" + ctx = SubProcContext('meow', ['cat'], stdin_str='喵\n') + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == '喵\n' + assert ctx.ret_code == 0 + + +def test_run_command_with_stdin_from_handle(): + """Test STDIN from a single opened file handle""" + handle = TemporaryFile() + handle.write('catches mice.\n'.encode('UTF-8')) + handle.seek(0) + ctx = SubProcContext('meow', ['cat'], stdin_files=[handle]) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\n' + assert ctx.ret_code == 0 + handle.close() + + +def test_run_command_with_stdin_from_path(): + """Test STDIN from a single file path""" + handle = NamedTemporaryFile() + handle.write('catches mice.\n'.encode('UTF-8')) + handle.seek(0) + ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name]) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\n' + assert ctx.ret_code == 0 + handle.close() + + +def test_run_command_with_stdin_from_handles(): + """Test STDIN from multiple file handles""" + handles = [] + for txt in ['catches mice.\n', 'eat fish.\n']: handle = TemporaryFile() - handle.write('catches mice.\n'.encode('UTF-8')) + handle.write(txt.encode('UTF-8')) handle.seek(0) - ctx = SubProcContext('meow', ['cat'], stdin_files=[handle]) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') 
- self.assertEqual(ctx.out, 'catches mice.\n') - self.assertEqual(ctx.ret_code, 0) + handles.append(handle) + ctx = SubProcContext('meow', ['cat'], stdin_files=handles) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\neat fish.\n' + assert ctx.ret_code == 0 + for handle in handles: handle.close() - def test_run_command_with_stdin_from_path(self): - """Test STDIN from a single file path""" + +def test_run_command_with_stdin_from_paths(): + """Test STDIN from multiple file paths""" + handles = [] + for txt in ['catches mice.\n', 'eat fish.\n']: handle = NamedTemporaryFile() - handle.write('catches mice.\n'.encode('UTF-8')) + handle.write(txt.encode('UTF-8')) handle.seek(0) - ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name]) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\n') - self.assertEqual(ctx.ret_code, 0) + handles.append(handle) + ctx = SubProcContext( + 'meow', ['cat'], stdin_files=[handle.name for handle in handles] + ) + SubProcPool.run_command(ctx) + assert ctx.err == '' + assert ctx.out == 'catches mice.\neat fish.\n' + assert ctx.ret_code == 0 + for handle in handles: handle.close() - def test_run_command_with_stdin_from_handles(self): - """Test STDIN from multiple file handles""" - handles = [] - for txt in ['catches mice.\n', 'eat fish.\n']: - handle = TemporaryFile() - handle.write(txt.encode('UTF-8')) - handle.seek(0) - handles.append(handle) - ctx = SubProcContext('meow', ['cat'], stdin_files=handles) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\neat fish.\n') - self.assertEqual(ctx.ret_code, 0) - for handle in handles: - handle.close() - - def test_run_command_with_stdin_from_paths(self): - """Test STDIN from multiple file paths""" - handles = [] - for txt in ['catches mice.\n', 'eat fish.\n']: - handle = NamedTemporaryFile() - handle.write(txt.encode('UTF-8')) - handle.seek(0) 
- handles.append(handle) - ctx = SubProcContext( - 'meow', ['cat'], stdin_files=[handle.name for handle in handles]) - SubProcPool.run_command(ctx) - self.assertEqual(ctx.err, '') - self.assertEqual(ctx.out, 'catches mice.\neat fish.\n') - self.assertEqual(ctx.ret_code, 0) - for handle in handles: - handle.close() - - def test_xfunction(self): - """Test xtrigger function import.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - the_answer_file = python_dir / "the_answer.py" - with the_answer_file.open(mode="w") as f: - f.write("""the_answer = lambda: 42""") - f.flush() - f_name = "the_answer" - fn = get_xtrig_func(f_name, f_name, temp_dir) - result = fn() - self.assertEqual(42, result) - - def test_xfunction_cache(self): - """Test xtrigger function import cache.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - amandita_file = python_dir / "amandita.py" - with amandita_file.open(mode="w") as f: - f.write("""choco = lambda: 'chocolate'""") - f.flush() - m_name = "amandita" # module - f_name = "choco" # function - fn = get_xtrig_func(m_name, f_name, temp_dir) - result = fn() - self.assertEqual('chocolate', result) - - # is in the cache - self.assertTrue((m_name, f_name) in _XTRIG_FUNC_CACHE) - # returned from cache - self.assertEqual(fn, get_xtrig_func(m_name, f_name, temp_dir)) - - def test_xfunction_import_error(self): - """Test for error on importing a xtrigger function. - - To prevent the test eventually failing if the test function is added - and successfully imported, we use an invalid module name as per Python - spec. 
- """ - with TemporaryDirectory() as temp_dir, self.assertRaises( - ModuleNotFoundError - ): - get_xtrig_func("invalid-module-name", "func-name", temp_dir) - - def test_xfunction_attribute_error(self): - """Test for error on looking for an attribute in a xtrigger script.""" - with TemporaryDirectory() as temp_dir: - python_dir = Path(temp_dir, "lib", "python") - python_dir.mkdir(parents=True) - the_answer_file = python_dir / "the_sword.py" - with the_answer_file.open(mode="w") as f: - f.write("""the_droid = lambda: 'excalibur'""") - f.flush() - f_name = "the_sword" - with self.assertRaises(AttributeError): - get_xtrig_func(f_name, f_name, temp_dir) + +def test_xfunction(): + """Test xtrigger function import.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + the_answer_file = python_dir / "the_answer.py" + with the_answer_file.open(mode="w") as f: + f.write("""the_answer = lambda: 42""") + f.flush() + f_name = "the_answer" + fn = get_xtrig_func(f_name, f_name, temp_dir) + result = fn() + assert 42 == result + + +def test_xfunction_cache(): + """Test xtrigger function import cache.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + amandita_file = python_dir / "amandita.py" + with amandita_file.open(mode="w") as f: + f.write("""choco = lambda: 'chocolate'""") + f.flush() + m_name = "amandita" # module + f_name = "choco" # function + fn = get_xtrig_func(m_name, f_name, temp_dir) + result = fn() + assert 'chocolate' == result + + # is in the cache + assert (m_name, f_name) in _XTRIG_FUNC_CACHE + # returned from cache + assert fn, get_xtrig_func(m_name, f_name == temp_dir) + + +def test_xfunction_import_error(): + """Test for error on importing a xtrigger function. + + To prevent the test eventually failing if the test function is added + and successfully imported, we use an invalid module name as per Python + spec. 
+ """ + with TemporaryDirectory() as temp_dir, pytest.raises(ModuleNotFoundError): + get_xtrig_func("invalid-module-name", "func-name", temp_dir) + + +def test_xfunction_attribute_error(): + """Test for error on looking for an attribute in a xtrigger script.""" + with TemporaryDirectory() as temp_dir: + python_dir = Path(temp_dir, "lib", "python") + python_dir.mkdir(parents=True) + the_answer_file = python_dir / "the_sword.py" + with the_answer_file.open(mode="w") as f: + f.write("""the_droid = lambda: 'excalibur'""") + f.flush() + f_name = "the_sword" + with pytest.raises(AttributeError): + get_xtrig_func(f_name, f_name, temp_dir) @pytest.fixture def mock_ctx(): def inner_(ret_code=None, host=None, cmd_key=None, cmd=None): - """Provide a SimpleNamespace which looks like a ctx object. - """ + """Provide a SimpleNamespace which looks like a ctx object.""" inputs = locals() defaults = { - 'ret_code': 255, 'host': 'mouse', 'cmd_key': 'my-command', - 'cmd': ['bistromathic', 'take-off'] + 'ret_code': 255, + 'host': 'mouse', + 'cmd_key': 'my-command', + 'cmd': ['bistromathic', 'take-off'], } for key in inputs: if inputs[key] is None: @@ -240,9 +249,10 @@ def inner_(ret_code=None, host=None, cmd_key=None, cmd=None): timestamp=None, ret_code=inputs['ret_code'], host=inputs['host'], - cmd_key=inputs['cmd_key'] + cmd_key=inputs['cmd_key'], ) return ctx + yield inner_ @@ -265,21 +275,18 @@ def _test_callback_255(ctx, foo=''): 'platform: None - Could not connect to mouse.', 255, 'ssh', - id="return 255" + id="return 255", ), pytest.param( 'platform: localhost - Could not connect to mouse.', 255, TaskJobLogsRetrieveContext(['ssh', 'something'], None, None), - id="return 255 (log-ret)" - ) - ] + id="return 255 (log-ret)", + ), + ], ) -def test__run_command_exit( - caplog, mock_ctx, expect, ret_code, cmd_key -): - """It runs a callback - """ +def test__run_command_exit(caplog, mock_ctx, expect, ret_code, cmd_key): + """It runs a callback""" ctx = mock_ctx(ret_code=ret_code, 
cmd_key=cmd_key, cmd=['ssh']) SubProcPool._run_command_exit( ctx, callback=_test_callback, callback_255=_test_callback_255 @@ -298,9 +305,7 @@ def test__run_command_exit_no_255_callback(caplog, mock_ctx): def test__run_command_exit_no_gettable_platform(caplog, mock_ctx): """It logs being unable to select a platform""" ret_ctx = TaskJobLogsRetrieveContext( - platform_name='rhenas', - max_size=256, - key='rhenas' + platform_name='rhenas', max_size=256, key='rhenas' ) ctx = mock_ctx(cmd_key=ret_ctx, cmd=['ssh'], ret_code=255) SubProcPool._run_command_exit(ctx, callback=_test_callback) @@ -315,20 +320,19 @@ def test__run_command_exit_no_255_args(caplog, mock_ctx): mock_ctx(cmd=['ssh', 'Zaphod']), callback=_test_callback, callback_args=['Zaphod'], - callback_255=_test_callback_255 + callback_255=_test_callback_255, ) assert '255' in caplog.records[1].msg def test__run_command_exit_add_to_badhosts(mock_ctx): - """It updates the list of badhosts - """ + """It updates the list of badhosts""" badhosts = {'foo', 'bar'} SubProcPool._run_command_exit( mock_ctx(cmd=['ssh']), bad_hosts=badhosts, callback=print, - callback_args=['Welcome to Magrathea'] + callback_args=['Welcome to Magrathea'], ) assert badhosts == {'foo', 'bar', 'mouse'} @@ -340,32 +344,36 @@ def test__run_command_exit_add_to_badhosts_log(caplog, mock_ctx): mock_ctx(cmd=['ssh']), bad_hosts=badhosts, callback=lambda x, t: print(str(x)), - callback_args=[TaskProxy( - Tokens('~u/w//c/t/2'), - SimpleNamespace( - name='t', dependencies={}, sequential='', - external_triggers=[], xtrig_labels={}, - expiration_offset=None, - outputs={ - TASK_OUTPUT_SUBMITTED: [None, None], - TASK_OUTPUT_SUBMIT_FAILED: [None, None], - TASK_OUTPUT_SUCCEEDED: [None, None], - TASK_OUTPUT_FAILED: [None, None], - TASK_OUTPUT_EXPIRED: [None, None], - }, - graph_children={}, rtconfig={'platform': 'foo'} - - ), - ISO8601Point('1990') - )] + callback_args=[ + TaskProxy( + Tokens('~u/w//c/t/2'), + SimpleNamespace( + name='t', + dependencies={}, + 
sequential='', + external_triggers=[], + xtrig_labels={}, + expiration_offset=None, + outputs={ + TASK_OUTPUT_SUBMITTED: [None, None], + TASK_OUTPUT_SUBMIT_FAILED: [None, None], + TASK_OUTPUT_SUCCEEDED: [None, None], + TASK_OUTPUT_FAILED: [None, None], + TASK_OUTPUT_EXPIRED: [None, None], + }, + graph_children={}, + rtconfig={'platform': 'foo'}, + ), + ISO8601Point('1990'), + ) + ], ) assert 'platform: foo' in caplog.records[0].message assert badhosts == {'foo', 'bar', 'mouse'} def test__run_command_exit_rsync_fails(mock_ctx): - """It updates the list of badhosts - """ + """It updates the list of badhosts""" badhosts = {'foo', 'bar'} ctx = mock_ctx(cmd=['rsync'], ret_code=42, cmd_key='file-install') SubProcPool._run_command_exit( @@ -376,10 +384,10 @@ def test__run_command_exit_rsync_fails(mock_ctx): { 'name': 'Magrathea', 'ssh command': 'ssh', - 'rsync command': 'rsync command' + 'rsync command': 'rsync command', }, 'Welcome to Magrathea', - ] + ], ) assert badhosts == {'foo', 'bar', 'mouse'} @@ -389,12 +397,11 @@ def test__run_command_exit_rsync_fails(mock_ctx): [ (True, {'cmd': ['ssh'], 'ret_code': 255}), (False, {'cmd': ['foo'], 'ret_code': 255}), - (False, {'cmd': ['ssh'], 'ret_code': 42}) - ] + (False, {'cmd': ['ssh'], 'ret_code': 42}), + ], ) def test_ssh_255_fail(mock_ctx, expect, ctx_kwargs): - """It knows when a ctx has failed - """ + """It knows when a ctx has failed""" output = SubProcPool.ssh_255_fail(mock_ctx(**ctx_kwargs)) assert output == expect @@ -406,11 +413,10 @@ def test_ssh_255_fail(mock_ctx, expect, ctx_kwargs): (True, {'cmd': ['rsync'], 'ret_code': 255, 'host': 'not_local'}), (False, {'cmd': ['make it-so'], 'ret_code': 255, 'host': 'not_local'}), (False, {'cmd': ['rsync'], 'ret_code': 125, 'host': 'localhost'}), - ] + ], ) def test_rsync_255_fail(mock_ctx, expect, ctx_kwargs): - """It knows when a ctx has failed - """ + """It knows when a ctx has failed""" output = SubProcPool.rsync_255_fail( mock_ctx(**ctx_kwargs), {'ssh command': 'ssh', 
'rsync command': 'rsync command'}, From afcdd81280f5a492f1cf5c7cdb213791a82dbbd8 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 2 Apr 2025 13:37:59 +0100 Subject: [PATCH 025/101] MyPy Linting --- cylc/flow/scripts/profiler.py | 4 +++- tests/unit/scripts/test_profiler.py | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index fede9b5d884..b31c564638f 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -105,12 +105,14 @@ def write_data(data, filename): f.write(data + "\n") -def get_cgroup_version(cgroup_location: Path, cgroup_name: Path) -> int: +def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: # HPC uses cgroups v2 and SPICE uses cgroups v1 if Path.exists(Path(cgroup_location + cgroup_name)): return 1 elif Path.exists(Path(cgroup_location + "/memory" + cgroup_name)): return 2 + else: + raise FileNotFoundError("Cgroup not found") def get_cgroup_name(): diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 54398fbee7c..85a2914887e 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -104,8 +104,9 @@ def test_get_cgroup_version(mocker): # Mock the Path.exists function call to return False mocker.patch("pathlib.Path.exists", return_value=False) - assert get_cgroup_version('stuff/in/other/place', - 'things') is None + with pytest.raises(FileNotFoundError): + get_cgroup_version('stuff/in/other/place', + 'things') def test_get_cgroup_paths(): From 378dcb98d0962e9f5bc1c60dd8a2893420ee2b07 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 2 Apr 2025 13:57:52 +0100 Subject: [PATCH 026/101] More unit tests --- tests/unit/scripts/test_profiler.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 85a2914887e..07141e38f7d 100644 
--- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -160,3 +160,31 @@ def test_profile_max_rss(mocker): run_once = mock.Mock(side_effect=[True, False]) profile(process, 1, 1, run_once) mock_file.assert_called_with("max_rss", "w") + + +def test_profile_1(mocker): + process = get_cgroup_paths(1, "test_location/", "test_name") + + mock_file = mocker.mock_open(read_data="") + mocker.patch("builtins.open", mock_file) + mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", return_value=1024) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) + run_once = mock.Mock(side_effect=[True, False]) + + profile(process, 1, 1, run_once) + mock_file.assert_called_with("max_rss", "w") + + +def test_profile_2(mocker): + # assert_called_with only shows the last call to open(). + # Setting peak memory to zero stops the memory call to open + process = get_cgroup_paths(1, "test_location/", "test_name") + + mock_file = mocker.mock_open(read_data="") + mocker.patch("builtins.open", mock_file) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) + mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", return_value=0) + run_once = mock.Mock(side_effect=[True, False]) + + profile(process, 1, 1, run_once) + mock_file.assert_called_with("cpu_time", "w") From ca1e79692f626ee12cdf4d0e1c35b2cf9d8b2767 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 2 Apr 2025 14:00:44 +0100 Subject: [PATCH 027/101] Linting --- tests/unit/scripts/test_profiler.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 07141e38f7d..e0df1c9f881 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -163,12 +163,15 @@ def test_profile_max_rss(mocker): def test_profile_1(mocker): - process = get_cgroup_paths(1, "test_location/", "test_name") + process = 
get_cgroup_paths( + 1, "test_location/", "test_name") mock_file = mocker.mock_open(read_data="") mocker.patch("builtins.open", mock_file) - mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", return_value=1024) - mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) + mocker.patch( + "cylc.flow.scripts.profiler.parse_memory_file", return_value=1024) + mocker.patch( + "cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) run_once = mock.Mock(side_effect=[True, False]) profile(process, 1, 1, run_once) @@ -178,12 +181,15 @@ def test_profile_1(mocker): def test_profile_2(mocker): # assert_called_with only shows the last call to open(). # Setting peak memory to zero stops the memory call to open - process = get_cgroup_paths(1, "test_location/", "test_name") + process = get_cgroup_paths( + 1, "test_location/", "test_name") mock_file = mocker.mock_open(read_data="") mocker.patch("builtins.open", mock_file) - mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) - mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", return_value=0) + mocker.patch( + "cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) + mocker.patch( + "cylc.flow.scripts.profiler.parse_memory_file", return_value=0) run_once = mock.Mock(side_effect=[True, False]) profile(process, 1, 1, run_once) From 58fae4f325afd0c96f333e308757abea2fd3bd20 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 2 Apr 2025 15:19:45 +0100 Subject: [PATCH 028/101] Adding towncrier fragment --- changes.d/6663.feat.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes.d/6663.feat.md diff --git a/changes.d/6663.feat.md b/changes.d/6663.feat.md new file mode 100644 index 00000000000..9a40cc43d59 --- /dev/null +++ b/changes.d/6663.feat.md @@ -0,0 +1 @@ +Adding CPU time and Max RSS to Analysis Tools From 64e9b2b305634986451e07cd0c70223887bee5a9 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 7 Apr 2025 13:54:58 
+0100 Subject: [PATCH 029/101] Review changes --- cylc/flow/cfgspec/globalcfg.py | 7 +++- cylc/flow/etc/job.sh | 8 ++--- cylc/flow/job_file.py | 3 ++ cylc/flow/scripts/profiler.py | 11 ++++--- .../jobscript/02-profiler/flow.cylc | 20 ------------ tests/unit/scripts/test_profiler.py | 32 +++++++++++++++++++ 6 files changed, 52 insertions(+), 29 deletions(-) diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 821b3fd3d1f..84d25c132e2 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -1337,7 +1337,12 @@ def default_for( .. versionadded:: 8.0.0 ''') - Conf('cgroups path', VDR.V_STRING, '/sys/fs/cgroup') + Conf('cgroups path', VDR.V_STRING, + default='/sys/fs/cgroup', + desc=''' + The path to the cgroups filesystem. The default value + (/sys/fs/cgroup) is the standard location for cgroups on + linux and should work in most circumstances''') Conf('job runner', VDR.V_STRING, 'background', desc=f''' The system used to run jobs on the platform. diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index a4747579538..143a70a9a41 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -141,7 +141,7 @@ cylc__job__main() { cd "${CYLC_TASK_WORK_DIR}" if [[ "${CYLC_PROFILE}" == "True" ]] ; then - cylc profile & + cylc profile -m "${CYLC_CGROUP}" & export profiler_pid="$!" 
fi @@ -163,20 +163,20 @@ cylc__job__main() { cylc__set_return "$ret_code" fi } + cylc__kill_profiler # Grab the max rss and cpu_time value before moving directory if [[ -f "max_rss" ]]; then - max_rss=$(sed -n '1p' max_rss) + max_rss=$(sed -n 'p' max_rss) rm max_rss fi if [[ -f "cpu_time" ]]; then - cpu_time=$(sed -n '1p' cpu_time) + cpu_time=$(sed -n 'p' cpu_time) rm cpu_time fi # Empty work directory remove cd rmdir "${CYLC_TASK_WORK_DIR}" 2>'/dev/null' || true # Send task succeeded message - cylc__kill_profiler wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>'/dev/null' || true diff --git a/cylc/flow/job_file.py b/cylc/flow/job_file.py index 489b3f99c47..a9991e88f89 100644 --- a/cylc/flow/job_file.py +++ b/cylc/flow/job_file.py @@ -228,6 +228,9 @@ def _write_task_environment(self, handle, job_conf): handle.write( "\n export CYLC_PROFILE=" f"{job_conf['platform']['profile']['activate']}") + handle.write( + "\n export CYLC_CGROUP=" + f"{job_conf['platform']['profile']['cgroups path']}") # Standard parameter environment variables for var, val in job_conf['param_var'].items(): handle.write('\n export CYLC_TASK_PARAM_%s="%s"' % (var, val)) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index b31c564638f..8f2d2981d54 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -46,7 +46,6 @@ def get_option_parser() -> COP: default=10, dest="delay") parser.add_option( "-m", type=str, help="Location of cgroups directory", - default="/sys/fs/cgroup", dest="cgroup_location") return parser @@ -101,8 +100,12 @@ def parse_cpu_file(cgroup_cpu_path, cgroup_version): def write_data(data, filename): - with open(filename, 'w') as f: - f.write(data + "\n") + try: + with open(filename, 'w') as f: + f.write(data + "\n") + except FileNotFoundError as err: + print(filename, data) + print(err) def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: @@ -175,7 +178,7 @@ def get_config(args): cgroup_name = get_cgroup_name() 
cgroup_version = get_cgroup_version(args.cgroup_location, cgroup_name) process = get_cgroup_paths(cgroup_version, - args.cgroups_location, + args.cgroup_location, cgroup_name) profile(process, cgroup_version, args.delay) diff --git a/tests/functional/jobscript/02-profiler/flow.cylc b/tests/functional/jobscript/02-profiler/flow.cylc index 951a5b94d20..77e5c2356dc 100644 --- a/tests/functional/jobscript/02-profiler/flow.cylc +++ b/tests/functional/jobscript/02-profiler/flow.cylc @@ -1,30 +1,10 @@ -[meta] - title = "job script torture test" - - description = """Any task job script may fail regardless of user runtime -settings if changes to cylc re-order the job script sections badly: e.g. -"cylc task started" must be called after the CYLC_ environment variables -are exported. Additionally, users may rely on the order of variable -definition in each environment and script section: e.g. workflow -bin directory must go in PATH before the task runtime environment is -defined because workflow bin commands could be used in variable assignment -expressions.""" - [scheduling] [[graph]] R1 = "foo" [runtime] [[foo]] platform = localhost - init-script = """ -echo "HELLO FROM INIT-SCRIPT" -# define a variable -export VAR_IS=is""" pre-script = """ -echo "HELLO FROM PRE-SCRIPT" -# init-script must be done: -echo VAR_IS is $VAR_IS -# user environment must be done: echo E_ONE is $E_ONE echo E_TWO is $E_TWO echo E_THR is $E_THR diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index e0df1c9f881..d83c2388ef1 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -15,15 +15,19 @@ # along with this program. If not, see . 
# # Tests for functions contained in cylc.flow.scripts.profiler +import unittest.mock + from cylc.flow.scripts.profiler import (parse_memory_file, parse_cpu_file, write_data, get_cgroup_name, get_cgroup_version, get_cgroup_paths, + get_config, stop_profiler, profile) import pytest +import argparse from unittest import mock @@ -194,3 +198,31 @@ def test_profile_2(mocker): profile(process, 1, 1, run_once) mock_file.assert_called_with("cpu_time", "w") + + +def test_get_config(mocker): + + # Mock the 'open' function call to return a valid string. + mock_file = mocker.mock_open(read_data="0::good/cgroup/place/2222222") + mocker.patch("builtins.open", mock_file) + + # Mock the get_cgroup_version function so it says the cgroup path is valid + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", + return_value=1) + # Mock the parse functions so they return valid values + mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", + return_value=1024) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) + + # Mock the write_data function to simulate writing data. 
It will error out on the 3rd call + mock_write = mock.Mock(side_effect=[None, None, FileNotFoundError('Carpe Diem')]) + mocker.patch("cylc.flow.scripts.profiler.write_data", mock_write) + + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", type=int, default=10, dest="delay") + parser.add_argument( + "-m", type=str, default="test_location/", + dest="cgroup_location") + with pytest.raises(FileNotFoundError): + get_config(parser.parse_args()) From 07348b76a1b02cfb022f8a06bd9569cecbcdca74 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 7 Apr 2025 14:06:51 +0100 Subject: [PATCH 030/101] Review changes --- cylc/flow/cfgspec/globalcfg.py | 2 +- tests/unit/scripts/test_profiler.py | 8 ++++---- tests/unit/test_job_file.py | 2 ++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 84d25c132e2..fcbe8e85c41 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -1341,7 +1341,7 @@ def default_for( default='/sys/fs/cgroup', desc=''' The path to the cgroups filesystem. The default value - (/sys/fs/cgroup) is the standard location for cgroups on + (/sys/fs/cgroup) is the standard location for cgroups on linux and should work in most circumstances''') Conf('job runner', VDR.V_STRING, 'background', desc=f''' diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index d83c2388ef1..b833882d0ae 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -15,8 +15,6 @@ # along with this program. If not, see . # # Tests for functions contained in cylc.flow.scripts.profiler -import unittest.mock - from cylc.flow.scripts.profiler import (parse_memory_file, parse_cpu_file, write_data, @@ -214,8 +212,10 @@ def test_get_config(mocker): return_value=1024) mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) - # Mock the write_data function to simulate writing data. 
It will error out on the 3rd call - mock_write = mock.Mock(side_effect=[None, None, FileNotFoundError('Carpe Diem')]) + # Mock the write_data function to simulate writing data. + # It will error out on the 3rd call + mock_write = mock.Mock( + side_effect=[None, None, FileNotFoundError('Carpe Diem')]) mocker.patch("cylc.flow.scripts.profiler.write_data", mock_write) parser = argparse.ArgumentParser() diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index 617d02fc1e1..f6dc1093202 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -400,6 +400,7 @@ def test_write_task_environment(): 'CYLC_TASK_TRY_NUMBER=1\n export ' 'CYLC_TASK_FLOW_NUMBERS=1\n export ' 'CYLC_PROFILE=true\n export ' + 'CYLC_CGROUP=exit_light\n export ' 'CYLC_TASK_PARAM_duck="quack"\n export ' 'CYLC_TASK_PARAM_mouse="squeak"\n ' 'CYLC_TASK_WORK_DIR_BASE=\'farm_noises/work_d\'\n}') @@ -408,6 +409,7 @@ def test_write_task_environment(): 'communication method': 'ssh', 'profile': { "activate": "true", + "cgroups path": 'exit_light' } }, "job_d": "1/moo/01", From b485fd7e84b7921b1e11a8794db809e1899fc021 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 7 Apr 2025 14:56:57 +0100 Subject: [PATCH 031/101] Review Changes --- tests/functional/jobscript/02-profiler/flow.cylc | 8 ++++++++ tests/unit/scripts/test_profiler.py | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/functional/jobscript/02-profiler/flow.cylc b/tests/functional/jobscript/02-profiler/flow.cylc index 77e5c2356dc..6fee252ce40 100644 --- a/tests/functional/jobscript/02-profiler/flow.cylc +++ b/tests/functional/jobscript/02-profiler/flow.cylc @@ -4,7 +4,15 @@ [runtime] [[foo]] platform = localhost + init-script = """ +echo "HELLO FROM INIT-SCRIPT" +# define a variable +export VAR_IS=is""" pre-script = """ +echo "HELLO FROM PRE-SCRIPT" +# init-script must be done: +echo VAR_IS is $VAR_IS +# user environment must be done: echo E_ONE is $E_ONE echo E_TWO is 
$E_TWO echo E_THR is $E_THR diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index b833882d0ae..837e9786044 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -210,7 +210,8 @@ def test_get_config(mocker): # Mock the parse functions so they return valid values mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", return_value=1024) - mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", + return_value=2048) # Mock the write_data function to simulate writing data. # It will error out on the 3rd call From 5793b23011e21c364bd8538723ef22582fa7755b Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 8 Apr 2025 11:09:12 +0100 Subject: [PATCH 032/101] Review Changes --- cylc/flow/etc/job.sh | 4 ++-- cylc/flow/scripts/profiler.py | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 143a70a9a41..0b29e3774e4 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -166,11 +166,11 @@ cylc__job__main() { cylc__kill_profiler # Grab the max rss and cpu_time value before moving directory if [[ -f "max_rss" ]]; then - max_rss=$(sed -n 'p' max_rss) + max_rss=$(cat max_rss) rm max_rss fi if [[ -f "cpu_time" ]]; then - cpu_time=$(sed -n 'p' cpu_time) + cpu_time=$(cat cpu_time) rm cpu_time fi # Empty work directory remove diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 8f2d2981d54..81d6302a648 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -100,12 +100,8 @@ def parse_cpu_file(cgroup_cpu_path, cgroup_version): def write_data(data, filename): - try: with open(filename, 'w') as f: f.write(data + "\n") - except FileNotFoundError as err: - print(filename, data) - print(err) def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: From 
73545ac72d86f0df39b74c9205715dff86056cbd Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 8 Apr 2025 11:16:34 +0100 Subject: [PATCH 033/101] Review Changes --- cylc/flow/scripts/profiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 81d6302a648..53d94b01061 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -100,8 +100,8 @@ def parse_cpu_file(cgroup_cpu_path, cgroup_version): def write_data(data, filename): - with open(filename, 'w') as f: - f.write(data + "\n") + with open(filename, 'w') as f: + f.write(data + "\n") def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: From bf1b9c9527f0e9d045f20c6f832fdf07ddc6d15d Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 10 Apr 2025 15:23:45 +0100 Subject: [PATCH 034/101] Review Changes --- cylc/flow/etc/job.sh | 2 +- cylc/flow/scripts/profiler.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 0b29e3774e4..5cfb4ac7e0d 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -163,7 +163,6 @@ cylc__job__main() { cylc__set_return "$ret_code" fi } - cylc__kill_profiler # Grab the max rss and cpu_time value before moving directory if [[ -f "max_rss" ]]; then max_rss=$(cat max_rss) @@ -173,6 +172,7 @@ cylc__job__main() { cpu_time=$(cat cpu_time) rm cpu_time fi + cylc__kill_profiler # Empty work directory remove cd rmdir "${CYLC_TASK_WORK_DIR}" 2>'/dev/null' || true diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 53d94b01061..0a7a6714b65 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -101,7 +101,7 @@ def parse_cpu_file(cgroup_cpu_path, cgroup_version): def write_data(data, filename): with open(filename, 'w') as f: - f.write(data + "\n") + f.write(data) def get_cgroup_version(cgroup_location: str, cgroup_name: str) 
-> int: From 30d4382a32ff517eff01bbbed68459405c1cbb86 Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Thu, 10 Apr 2025 09:46:36 +0100 Subject: [PATCH 035/101] profiler: add functional test for cgroup profiling --- tests/functional/jobscript/02-profiler.t | 86 ++++++++++++++----- .../jobscript/02-profiler/bin/foo.sh | 2 - .../jobscript/02-profiler/flow.cylc | 65 -------------- .../jobscript/02-profiler/reference.log | 3 - 4 files changed, 65 insertions(+), 91 deletions(-) delete mode 100755 tests/functional/jobscript/02-profiler/bin/foo.sh delete mode 100644 tests/functional/jobscript/02-profiler/flow.cylc delete mode 100644 tests/functional/jobscript/02-profiler/reference.log diff --git a/tests/functional/jobscript/02-profiler.t b/tests/functional/jobscript/02-profiler.t index e43705d4720..9d369e70728 100644 --- a/tests/functional/jobscript/02-profiler.t +++ b/tests/functional/jobscript/02-profiler.t @@ -15,31 +15,75 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . #------------------------------------------------------------------------------- -# cylc profile test +# Cylc profile test +# NOTE: This test will run the Cylc profiler on the given test platform. +# The test platform may need to be configured for this to work (e.g. +# "cgroups path" may need to be set). +REQUIRE_PLATFORM='runner:?(pbs|slurm) comms:tcp' . 
"$(dirname "$0")/test_header" -#------------------------------------------------------------------------------- -set_test_number 2 -#------------------------------------------------------------------------------- -install_workflow "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" -#------------------------------------------------------------------------------- -TEST_NAME="${TEST_NAME_BASE}-validate" -run_ok "${TEST_NAME}" cylc validate "${WORKFLOW_NAME}" - -if [[ -n "${PYTHONPATH:-}" ]]; then - export PYTHONPATH="${PWD}/lib:${PYTHONPATH}" -else - export PYTHONPATH="${PWD}/lib" -fi +set_test_number 12 -export PATH_TO_CYLC_BIN="/path/to/cylc/bin" -create_test_global_config ' +create_test_global_config " [platforms] + [[${CYLC_TEST_PLATFORM}]] + [[[profile]]] + activate = True + # TODO: set the interval to something like 1s [[localhost]] [[[profile]]] - activate = true -' -#------------------------------------------------------------------------------- -TEST_NAME="${TEST_NAME_BASE}-run" -workflow_run_ok "${TEST_NAME}" cylc play --reference-test --debug --no-detach "${WORKFLOW_NAME}" + activate = True + cgroups path = /no/such/path +" + +init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' +#!Jinja2 + +[scheduling] + [[graph]] + R1 = the_good & the_bad? 
& the_ugly + +[runtime] + [[the_good]] + # this task should succeed normally + platform = {{ environ['CYLC_TEST_PLATFORM'] }} + script = sleep 1 + [[the_bad]] + # this task should fail (it should still send profiling info) + platform = {{ environ['CYLC_TEST_PLATFORM'] }} + script = false + [[the_ugly]] + # this task should succeed despite the broken profiler configuration + platform = localhost + script = sleep 1 +__FLOW_CONFIG__ + +run_ok "${TEST_NAME_BASE}-validate" cylc validate "${WORKFLOW_NAME}" +workflow_run_ok "${TEST_NAME_BASE}-run" cylc play --debug --no-detach "${WORKFLOW_NAME}" + +# ensure the cpu and memory messages were received and that these messages +# were received before the succeeded message +log_scan "${TEST_NAME_BASE}-task-succeeded" \ + "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ + '1/the_good.*(received)cpu_time.*' \ + '1/the_good.*(received)succeeded' +log_scan "${TEST_NAME_BASE}-task-succeeded" \ + "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ + '1/the_good.*(received)max_rss.*' \ + '1/the_good.*(received)succeeded' + +# ensure the cpu and memory messages were received and that these messages +# were received before the failed message +log_scan "${TEST_NAME_BASE}-task-succeeded" \ + "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ + '1/the_bad.*(received)cpu_time.*' \ + '1/the_bad.*(received)failed' +log_scan "${TEST_NAME_BASE}-task-succeeded" \ + "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ + '1/the_bad.*(received)max_rss.*' \ + '1/the_bad.*(received)failed' + +# ensure this task succeeded despite the broken profiler configuration +grep_workflow_log_ok "${TEST_NAME_BASE}-broken" '1/the_ugly.*(received)succeeded' +grep_ok 'FileNotFoundError: Cgroup not found' "$(cylc cat-log "${WORKFLOW_NAME}//1/the_ugly" -f e -m p)" purge diff --git a/tests/functional/jobscript/02-profiler/bin/foo.sh b/tests/functional/jobscript/02-profiler/bin/foo.sh deleted file mode 100755 index 4b20577c0d0..00000000000 ---
a/tests/functional/jobscript/02-profiler/bin/foo.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/usr/bin/env bash -echo "Hello from $0" diff --git a/tests/functional/jobscript/02-profiler/flow.cylc b/tests/functional/jobscript/02-profiler/flow.cylc deleted file mode 100644 index 6fee252ce40..00000000000 --- a/tests/functional/jobscript/02-profiler/flow.cylc +++ /dev/null @@ -1,65 +0,0 @@ -[scheduling] - [[graph]] - R1 = "foo" -[runtime] - [[foo]] - platform = localhost - init-script = """ -echo "HELLO FROM INIT-SCRIPT" -# define a variable -export VAR_IS=is""" - pre-script = """ -echo "HELLO FROM PRE-SCRIPT" -# init-script must be done: -echo VAR_IS is $VAR_IS -# user environment must be done: -echo E_ONE is $E_ONE -echo E_TWO is $E_TWO -echo E_THR is $E_THR -echo E_FOU is $E_FOU -echo E_FIV is $E_FIV -# define a variable -export VAR_PreCS=precs""" - script = """ -echo "HELLO FROM SCRIPT" -# init-script must be done: -echo VAR_IS is $VAR_IS -# pre-script must be done: -echo VAR_PreCS is $VAR_PreCS -# environment must be done: -echo E_ONE is $E_ONE -echo E_TWO is $E_TWO -echo E_THR is $E_THR -echo E_FOU is $E_FOU -echo E_FIV is $E_FIV -# define a variable -export VAR_CS=var_cs""" - post-script = """ -echo "HELLO FROM POST-SCRIPT" -# init-script must be done: -echo VAR_IS is $VAR_IS -# pre-script must be done: -echo VAR_PreCS is $VAR_PreCS -# script must be done: -echo VAR_CS is $VAR_CS -# environment must be done: -echo E_ONE is $E_ONE -echo E_TWO is $E_TWO -echo E_THR is $E_THR -echo E_FOU is $E_FOU -echo E_FIV is $E_FIV -echo VAR_IS is $VAR_IS -echo VAR_PreCS is $VAR_PreCS -echo VAR_CS is $VAR_CS -# define a variable -export VAR_PostCS=postcs""" - [[[environment]]] - # path to cylc must be available: - E_ONE = $(( RANDOM % 10 )) - # init-script must be done: - E_TWO = $VAR_IS - # cylc-defined variables must be done: - E_THR = $CYLC_WORKFLOW_SHARE_DIR - E_FOU = $CYLC_TASK_NAME - # the workflow bin must be in $PATH already: - E_FIV = $( foo.sh ) diff --git 
a/tests/functional/jobscript/02-profiler/reference.log b/tests/functional/jobscript/02-profiler/reference.log deleted file mode 100644 index 08fe5d5558a..00000000000 --- a/tests/functional/jobscript/02-profiler/reference.log +++ /dev/null @@ -1,3 +0,0 @@ -Initial point: 1 -Final point: 1 -1/foo -triggered off [] From 49fcbc891aa4abedee4f185d253c34237599098b Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 7 Apr 2025 13:54:58 +0100 Subject: [PATCH 036/101] Review Changes Review changes --- cylc/flow/cfgspec/globalcfg.py | 7 +++- cylc/flow/etc/job.sh | 8 ++--- cylc/flow/job_file.py | 3 ++ cylc/flow/scripts/profiler.py | 5 ++- .../jobscript/02-profiler/flow.cylc | 12 ------- tests/unit/scripts/test_profiler.py | 35 ++++++++++++++++++- tests/unit/test_job_file.py | 2 ++ 7 files changed, 51 insertions(+), 21 deletions(-) diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index 821b3fd3d1f..fcbe8e85c41 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -1337,7 +1337,12 @@ def default_for( .. versionadded:: 8.0.0 ''') - Conf('cgroups path', VDR.V_STRING, '/sys/fs/cgroup') + Conf('cgroups path', VDR.V_STRING, + default='/sys/fs/cgroup', + desc=''' + The path to the cgroups filesystem. The default value + (/sys/fs/cgroup) is the standard location for cgroups on + linux and should work in most circumstances''') Conf('job runner', VDR.V_STRING, 'background', desc=f''' The system used to run jobs on the platform. diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index a4747579538..5cfb4ac7e0d 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -141,7 +141,7 @@ cylc__job__main() { cd "${CYLC_TASK_WORK_DIR}" if [[ "${CYLC_PROFILE}" == "True" ]] ; then - cylc profile & + cylc profile -m "${CYLC_CGROUP}" & export profiler_pid="$!" 
fi @@ -165,18 +165,18 @@ cylc__job__main() { } # Grab the max rss and cpu_time value before moving directory if [[ -f "max_rss" ]]; then - max_rss=$(sed -n '1p' max_rss) + max_rss=$(cat max_rss) rm max_rss fi if [[ -f "cpu_time" ]]; then - cpu_time=$(sed -n '1p' cpu_time) + cpu_time=$(cat cpu_time) rm cpu_time fi + cylc__kill_profiler # Empty work directory remove cd rmdir "${CYLC_TASK_WORK_DIR}" 2>'/dev/null' || true # Send task succeeded message - cylc__kill_profiler wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>'/dev/null' || true diff --git a/cylc/flow/job_file.py b/cylc/flow/job_file.py index 489b3f99c47..a9991e88f89 100644 --- a/cylc/flow/job_file.py +++ b/cylc/flow/job_file.py @@ -228,6 +228,9 @@ def _write_task_environment(self, handle, job_conf): handle.write( "\n export CYLC_PROFILE=" f"{job_conf['platform']['profile']['activate']}") + handle.write( + "\n export CYLC_CGROUP=" + f"{job_conf['platform']['profile']['cgroups path']}") # Standard parameter environment variables for var, val in job_conf['param_var'].items(): handle.write('\n export CYLC_TASK_PARAM_%s="%s"' % (var, val)) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index b31c564638f..0a7a6714b65 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -46,7 +46,6 @@ def get_option_parser() -> COP: default=10, dest="delay") parser.add_option( "-m", type=str, help="Location of cgroups directory", - default="/sys/fs/cgroup", dest="cgroup_location") return parser @@ -102,7 +101,7 @@ def parse_cpu_file(cgroup_cpu_path, cgroup_version): def write_data(data, filename): with open(filename, 'w') as f: - f.write(data + "\n") + f.write(data) def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: @@ -175,7 +174,7 @@ def get_config(args): cgroup_name = get_cgroup_name() cgroup_version = get_cgroup_version(args.cgroup_location, cgroup_name) process = get_cgroup_paths(cgroup_version, - args.cgroups_location, + args.cgroup_location, cgroup_name) 
profile(process, cgroup_version, args.delay) diff --git a/tests/functional/jobscript/02-profiler/flow.cylc b/tests/functional/jobscript/02-profiler/flow.cylc index 951a5b94d20..6fee252ce40 100644 --- a/tests/functional/jobscript/02-profiler/flow.cylc +++ b/tests/functional/jobscript/02-profiler/flow.cylc @@ -1,15 +1,3 @@ -[meta] - title = "job script torture test" - - description = """Any task job script may fail regardless of user runtime -settings if changes to cylc re-order the job script sections badly: e.g. -"cylc task started" must be called after the CYLC_ environment variables -are exported. Additionally, users may rely on the order of variable -definition in each environment and script section: e.g. workflow -bin directory must go in PATH before the task runtime environment is -defined because workflow bin commands could be used in variable assignment -expressions.""" - [scheduling] [[graph]] R1 = "foo" diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index e0df1c9f881..db2f97d894d 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -21,9 +21,11 @@ get_cgroup_name, get_cgroup_version, get_cgroup_paths, + get_config, stop_profiler, profile) import pytest +import argparse from unittest import mock @@ -66,7 +68,7 @@ def test_write_data(tmpdir): file = tmpdir.join('output.txt') write_data('test_data', file.strpath) - assert file.read() == 'test_data\n' + assert file.read() == 'test_data' def test_get_cgroup_name(mocker): @@ -194,3 +196,34 @@ def test_profile_2(mocker): profile(process, 1, 1, run_once) mock_file.assert_called_with("cpu_time", "w") + + +def test_get_config(mocker): + + # Mock the 'open' function call to return a valid string. 
+ mock_file = mocker.mock_open(read_data="0::good/cgroup/place/2222222") + mocker.patch("builtins.open", mock_file) + + # Mock the get_cgroup_version function so it says the cgroup path is valid + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", + return_value=1) + # Mock the parse functions so they return valid values + mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", + return_value=1024) + mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", + return_value=2048) + + # Mock the write_data function to simulate writing data. + # It will error out on the 3rd call + mock_write = mock.Mock( + side_effect=[None, None, FileNotFoundError('Carpe Diem')]) + mocker.patch("cylc.flow.scripts.profiler.write_data", mock_write) + + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", type=int, default=10, dest="delay") + parser.add_argument( + "-m", type=str, default="test_location/", + dest="cgroup_location") + with pytest.raises(FileNotFoundError): + get_config(parser.parse_args()) diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index 617d02fc1e1..f6dc1093202 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -400,6 +400,7 @@ def test_write_task_environment(): 'CYLC_TASK_TRY_NUMBER=1\n export ' 'CYLC_TASK_FLOW_NUMBERS=1\n export ' 'CYLC_PROFILE=true\n export ' + 'CYLC_CGROUP=exit_light\n export ' 'CYLC_TASK_PARAM_duck="quack"\n export ' 'CYLC_TASK_PARAM_mouse="squeak"\n ' 'CYLC_TASK_WORK_DIR_BASE=\'farm_noises/work_d\'\n}') @@ -408,6 +409,7 @@ def test_write_task_environment(): 'communication method': 'ssh', 'profile': { "activate": "true", + "cgroups path": 'exit_light' } }, "job_d": "1/moo/01", From c63a250315c6428334942c7f1aac3903afe85f14 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 15 Apr 2025 10:06:32 +0100 Subject: [PATCH 037/101] Added polling interval configuration --- cylc/flow/cfgspec/globalcfg.py | 8 ++++++++ cylc/flow/etc/job.sh | 2 +- 
cylc/flow/job_file.py | 3 +++ cylc/flow/scripts/profiler.py | 4 ++-- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index fcbe8e85c41..b5c19263e0f 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -1343,6 +1343,14 @@ def default_for( The path to the cgroups filesystem. The default value (/sys/fs/cgroup) is the standard location for cgroups on linux and should work in most circumstances''') + Conf('polling interval', VDR.V_INTEGER, + default=10, + desc=''' + The interval (in seconds) at which the profiler will + poll the cgroups filesystem for resource usage data. + The default value of 10 seconds should be sufficient for + most use cases, but can be adjusted as needed. + ''') Conf('job runner', VDR.V_STRING, 'background', desc=f''' The system used to run jobs on the platform. diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 5cfb4ac7e0d..f16fd674f4f 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -141,7 +141,7 @@ cylc__job__main() { cd "${CYLC_TASK_WORK_DIR}" if [[ "${CYLC_PROFILE}" == "True" ]] ; then - cylc profile -m "${CYLC_CGROUP}" & + cylc profile -m "${CYLC_CGROUP}" -i "${CYLC_POLLING_INTERVAL}" & export profiler_pid="$!" 
fi diff --git a/cylc/flow/job_file.py b/cylc/flow/job_file.py index a9991e88f89..cc44aa42e03 100644 --- a/cylc/flow/job_file.py +++ b/cylc/flow/job_file.py @@ -231,6 +231,9 @@ def _write_task_environment(self, handle, job_conf): handle.write( "\n export CYLC_CGROUP=" f"{job_conf['platform']['profile']['cgroups path']}") + handle.write( + "\n export CYLC_POLLING_INTERVAL=" + f"{job_conf['platform']['profile']['polling interval']}") # Standard parameter environment variables for var, val in job_conf['param_var'].items(): handle.write('\n export CYLC_TASK_PARAM_%s="%s"' % (var, val)) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 0a7a6714b65..3448d29a6be 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -42,8 +42,8 @@ def get_option_parser() -> COP: ], ) parser.add_option( - "-i", type=int, help="interval between query cycles in seconds", - default=10, dest="delay") + "-i", type=int, + help="interval between query cycles in seconds", dest="delay") parser.add_option( "-m", type=str, help="Location of cgroups directory", dest="cgroup_location") From 939e12891e7579492662ff8820995bb6b7442dc2 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 15 Apr 2025 10:16:30 +0100 Subject: [PATCH 038/101] Update unit tests --- tests/unit/test_job_file.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index f6dc1093202..3ccd8d2923c 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -401,6 +401,7 @@ def test_write_task_environment(): 'CYLC_TASK_FLOW_NUMBERS=1\n export ' 'CYLC_PROFILE=true\n export ' 'CYLC_CGROUP=exit_light\n export ' + 'CYLC_POLLING_INTERVAL=1\n export ' 'CYLC_TASK_PARAM_duck="quack"\n export ' 'CYLC_TASK_PARAM_mouse="squeak"\n ' 'CYLC_TASK_WORK_DIR_BASE=\'farm_noises/work_d\'\n}') @@ -409,7 +410,8 @@ def test_write_task_environment(): 'communication method': 'ssh', 'profile': { 
"activate": "true", - "cgroups path": 'exit_light' + "cgroups path": 'exit_light', + "polling interval": 1 } }, "job_d": "1/moo/01", From 4928405d6e244fba2b139d411cd31145539ca16b Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 16 Apr 2025 15:15:30 +0100 Subject: [PATCH 039/101] Added name to CONTRIBUTING.md --- CONTRIBUTING.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c576e12c4a0..ce9222a6604 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -99,6 +99,7 @@ requests_). - Maxime Rio - Paul Armstrong - Paul Earnshaw + - Christopher Bennett (All contributors are identifiable with email addresses in the git version From 66acd1f7a736749c786fe4aac3b42cfd1af5077c Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 17 Apr 2025 10:37:12 +0100 Subject: [PATCH 040/101] Added name to .mailmap --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 7bbf47d2375..5fcd097170a 100644 --- a/.mailmap +++ b/.mailmap @@ -58,3 +58,4 @@ github-actions[bot] github-actions[bot] GitHub Action Diquan Jabbour <165976689+Diquan-BOM@users.noreply.github.com> Maxime Rio +Christopher Bennett ChrisPaulBennett From cacf077e2d15fa0799895a38c8143e93e08b6de8 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 22 Apr 2025 16:43:00 +0100 Subject: [PATCH 041/101] Fixed syntax error --- tests/unit/test_job_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index 398de048bfb..386869b5c57 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -411,7 +411,7 @@ def test_write_task_environment(): 'communication method': 'ssh', 'profile': { "activate": "true", - "cgroups path": 'exit_light' + "cgroups path": 'exit_light', "polling interval": 1 } }, From 99f9ae510f05b26f404de393cd32c7b686df713c Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 23 Apr 2025 16:41:24 +0100 
Subject: [PATCH 042/101] Fixed syntax error --- cylc/flow/etc/job.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 85e717cb477..f16fd674f4f 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -141,7 +141,6 @@ cylc__job__main() { cd "${CYLC_TASK_WORK_DIR}" if [[ "${CYLC_PROFILE}" == "True" ]] ; then - cylc profile -m "${CYLC_CGROUP}" & cylc profile -m "${CYLC_CGROUP}" -i "${CYLC_POLLING_INTERVAL}" & export profiler_pid="$!" fi From a542523c88ed5bb70f907aa02af33a2bc0f6481e Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 24 Apr 2025 15:41:48 +0100 Subject: [PATCH 043/101] Fixed the issue where CPU / Max RSS data is not available in the event of job failure --- cylc/flow/etc/job.sh | 16 ++++++++-------- tests/functional/jobscript/02-profiler.t | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index f16fd674f4f..bd60a693280 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -164,14 +164,6 @@ cylc__job__main() { fi } # Grab the max rss and cpu_time value before moving directory - if [[ -f "max_rss" ]]; then - max_rss=$(cat max_rss) - rm max_rss - fi - if [[ -f "cpu_time" ]]; then - cpu_time=$(cat cpu_time) - rm cpu_time - fi cylc__kill_profiler # Empty work directory remove cd @@ -208,6 +200,14 @@ cylc__set_return() { ############################################################################### # Save the data using cylc message and exit the profiler cylc__kill_profiler() { + if [[ -f "max_rss" ]]; then + max_rss=$(cat max_rss) + rm max_rss + fi + if [[ -f "cpu_time" ]]; then + cpu_time=$(cat cpu_time) + rm cpu_time + fi if [[ -n "${cpu_time:-}" ]]; then cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: cpu_time $cpu_time" || true fi diff --git a/tests/functional/jobscript/02-profiler.t b/tests/functional/jobscript/02-profiler.t index 9d369e70728..17e852fb384 100644 --- 
a/tests/functional/jobscript/02-profiler.t +++ b/tests/functional/jobscript/02-profiler.t @@ -28,7 +28,7 @@ create_test_global_config " [[${CYLC_TEST_PLATFORM}]] [[[profile]]] activate = True - # TODO: set the interval to something like 1s + polling interval = 1 [[localhost]] [[[profile]]] activate = True @@ -50,7 +50,7 @@ init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' [[the_bad]] # this task should fail (it should still send profiling info) platform = {{ environ['CYLC_TEST_PLATFORM'] }} - script = false + script = sleep 5; false [[the_ugly]] # this task should succeed despite the broken profiler configuration platform = localhost From 1a633658df97298cb85979431d5b6b89bf451711 Mon Sep 17 00:00:00 2001 From: Christopher Bennett Date: Thu, 24 Apr 2025 15:42:17 +0100 Subject: [PATCH 044/101] Update cylc/flow/cfgspec/globalcfg.py Co-authored-by: Oliver Sanders --- cylc/flow/cfgspec/globalcfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/cfgspec/globalcfg.py b/cylc/flow/cfgspec/globalcfg.py index b5c19263e0f..10a970e831d 100644 --- a/cylc/flow/cfgspec/globalcfg.py +++ b/cylc/flow/cfgspec/globalcfg.py @@ -1332,7 +1332,7 @@ def default_for( ''') with Conf('profile'): - Conf('activate', VDR.V_BOOLEAN, True, desc=''' + Conf('activate', VDR.V_BOOLEAN, False, desc=''' A Boolean that sets if the cylc profiler will be used .. 
versionadded:: 8.0.0 From 048606b8c6444f39b2fe9b5f628f385c10145983 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 28 Apr 2025 10:12:13 +0100 Subject: [PATCH 045/101] Refactored max rss and cpu time data flow --- cylc/flow/etc/job.sh | 29 +++++++++++------------- cylc/flow/scripts/profiler.py | 11 +++++---- tests/functional/jobscript/02-profiler.t | 14 +++--------- 3 files changed, 22 insertions(+), 32 deletions(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index bd60a693280..178aafa0d89 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -163,8 +163,10 @@ cylc__job__main() { cylc__set_return "$ret_code" fi } - # Grab the max rss and cpu_time value before moving directory - cylc__kill_profiler + # Grab the max rss and cpu_time and clean up before changing directory + if [[ "${CYLC_PROFILE}" == "True" ]] ; then + cylc__kill_profiler + fi # Empty work directory remove cd rmdir "${CYLC_TASK_WORK_DIR}" 2>'/dev/null' || true @@ -200,19 +202,11 @@ cylc__set_return() { ############################################################################### # Save the data using cylc message and exit the profiler cylc__kill_profiler() { - if [[ -f "max_rss" ]]; then - max_rss=$(cat max_rss) - rm max_rss - fi - if [[ -f "cpu_time" ]]; then - cpu_time=$(cat cpu_time) - rm cpu_time - fi - if [[ -n "${cpu_time:-}" ]]; then - cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: cpu_time $cpu_time" || true - fi - if [[ -n "${max_rss:-}" ]]; then - cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: max_rss $max_rss" || true + if [[ -f "profiler.json" ]]; then + max_rss="$(jq -r '.max_rss' profiler.json)" + cpu_time="$(jq -r '.cpu_time' profiler.json)" + cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: cpu_time $cpu_time max_rss $max_rss" || true + rm profiler.json fi if [[ -f "proc/${profiler_pid}" ]]; then kill -s SIGINT "${profiler_pid}" || true @@ -300,7 +294,10 @@ cylc__job_finish_err() { # (Ignore 
shellcheck "globbing and word splitting" warning here). # shellcheck disable=SC2086 trap '' ${CYLC_VACATION_SIGNALS:-} ${CYLC_FAIL_SIGNALS} - cylc__kill_profiler + + if [[ "${CYLC_PROFILE}" == "True" ]] ; then + cylc__kill_profiler + fi if [[ -n "${CYLC_TASK_MESSAGE_STARTED_PID:-}" ]]; then wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>'/dev/null' || true fi diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 3448d29a6be..cde1e1f2763 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -24,6 +24,7 @@ import re import sys import time +import json import signal from pathlib import Path from dataclasses import dataclass @@ -99,9 +100,11 @@ def parse_cpu_file(cgroup_cpu_path, cgroup_version): return int(line) / 1000000 -def write_data(data, filename): +def write_data(peak_memory, cpu_time, filename): + data = {'max_rss': peak_memory, + 'cpu_time': cpu_time} with open(filename, 'w') as f: - f.write(data) + json.dump(data, f, indent=4) def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: @@ -156,13 +159,11 @@ def profile(process, version, delay, keep_looping=lambda: True): while keep_looping(): # Write cpu / memory usage data to disk cpu_time = parse_cpu_file(process.cgroup_cpu_path, version) - write_data(str(cpu_time), "cpu_time") - memory = parse_memory_file(process.cgroup_memory_path) # Only save Max RSS to disk if it is above the previous value if memory > peak_memory: peak_memory = memory - write_data(str(peak_memory), "max_rss") + write_data(str(peak_memory), cpu_time, "profiler.json") time.sleep(delay) diff --git a/tests/functional/jobscript/02-profiler.t b/tests/functional/jobscript/02-profiler.t index 17e852fb384..44d204bdfd5 100644 --- a/tests/functional/jobscript/02-profiler.t +++ b/tests/functional/jobscript/02-profiler.t @@ -21,7 +21,7 @@ # "cgroups path" may need to be set). REQUIRE_PLATFORM='runner:?(pbs|slurm) comms:tcp' . 
"$(dirname "$0")/test_header" -set_test_number 12 +set_test_number 8 create_test_global_config " [platforms] @@ -64,22 +64,14 @@ workflow_run_ok "${TEST_NAME_BASE}-run" cylc play --debug --no-detach "${WORKFLO # were received before the succeeded message log_scan "${TEST_NAME_BASE}-task-succeeded" \ "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ - '1/the_good.*(received)cpu_time.*' \ - '1/the_good.*(received)succeeded' -log_scan "${TEST_NAME_BASE}-task-succeeded" \ - "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ - '1/the_good.*(received)max_rss.*' \ + '1/the_good.*(received)cpu_time.*max_rss*' \ '1/the_good.*(received)succeeded' # ensure the cpu and memory messages were received and that these messages # were received before the failed message log_scan "${TEST_NAME_BASE}-task-succeeded" \ "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ - '1/the_bad.*(received)cpu_time.*' \ - '1/the_bad.*(received)failed' -log_scan "${TEST_NAME_BASE}-task-succeeded" \ - "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ - '1/the_bad.*(received)max_rss.*' \ + '1/the_bad.*(received)cpu_time.*max_rss*' \ '1/the_bad.*(received)failed' # ensure this task succeeded despite the broken profiler configuration From b05c38dc8e90c2518267a7c40aba27f46c846b27 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 28 Apr 2025 10:27:01 +0100 Subject: [PATCH 046/101] Updating unit tests --- tests/unit/scripts/test_profiler.py | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index db2f97d894d..f8d9a40813d 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -67,8 +67,8 @@ def test_write_data(tmpdir): # Create tmp file file = tmpdir.join('output.txt') - write_data('test_data', file.strpath) - assert file.read() == 'test_data' + write_data('test_memory', 'test_cpu', file.strpath) + assert file.read() == '{\n "max_rss": "test_memory",\n 
"cpu_time": "test_cpu"\n}' def test_get_cgroup_name(mocker): @@ -126,7 +126,7 @@ def test_get_cgroup_paths(): "test_location/cpu/test_name/cpuacct.usage") -def test_profile_cpu(mocker): +def test_profile_data(mocker): process = get_cgroup_paths(1, "test_location/", "test_name") @@ -138,7 +138,7 @@ def test_profile_cpu(mocker): return_value=2048) run_once = mock.Mock(side_effect=[True, False]) profile(process, 1, 1, run_once) - mock_file.assert_called_with("cpu_time", "w") + mock_file.assert_called_with("profiler.json", "w") def test_stop_profiler(): @@ -148,22 +148,6 @@ def test_stop_profiler(): assert pytest_wrapped_e.value.code == 0 -def test_profile_max_rss(mocker): - process = get_cgroup_paths(1, - "test_location/", - "test_name") - - mock_file = mocker.mock_open(read_data="") - mocker.patch("builtins.open", mock_file) - mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", - return_value=1024) - mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", - return_value=2048) - run_once = mock.Mock(side_effect=[True, False]) - profile(process, 1, 1, run_once) - mock_file.assert_called_with("max_rss", "w") - - def test_profile_1(mocker): process = get_cgroup_paths( 1, "test_location/", "test_name") @@ -177,7 +161,7 @@ def test_profile_1(mocker): run_once = mock.Mock(side_effect=[True, False]) profile(process, 1, 1, run_once) - mock_file.assert_called_with("max_rss", "w") + mock_file.assert_called_with("profiler.json", "w") def test_profile_2(mocker): @@ -195,7 +179,7 @@ def test_profile_2(mocker): run_once = mock.Mock(side_effect=[True, False]) profile(process, 1, 1, run_once) - mock_file.assert_called_with("cpu_time", "w") + mock_file.assert_called_with("profiler.json", "w") def test_get_config(mocker): From 80963cd6075ed0e24708eb3061cb128ece3def10 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 28 Apr 2025 11:07:34 +0100 Subject: [PATCH 047/101] Linting --- tests/unit/scripts/test_profiler.py | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index f8d9a40813d..1d66cbd53bc 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -68,7 +68,8 @@ def test_write_data(tmpdir): file = tmpdir.join('output.txt') write_data('test_memory', 'test_cpu', file.strpath) - assert file.read() == '{\n "max_rss": "test_memory",\n "cpu_time": "test_cpu"\n}' + assert file.read() == ('{\n "max_rss": "test_memory",\n' + ' "cpu_time": "test_cpu"\n}') def test_get_cgroup_name(mocker): From 5d3cb0c7f80ea39f1df2becc4d064abbc9a809ea Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 19 May 2025 11:16:53 +0100 Subject: [PATCH 048/101] Linting --- tests/functional/jobscript/02-profiler.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/jobscript/02-profiler.t b/tests/functional/jobscript/02-profiler.t index 44d204bdfd5..44391be4ad5 100644 --- a/tests/functional/jobscript/02-profiler.t +++ b/tests/functional/jobscript/02-profiler.t @@ -19,7 +19,7 @@ # NOTE: This test will run the Cylc profiler on the given test platform. # The test platform may need to be configured for this to work (e.g. # "cgroups path" may need to be set). -REQUIRE_PLATFORM='runner:?(pbs|slurm) comms:tcp' +export REQUIRE_PLATFORM='runner:?(pbs|slurm) comms:tcp' . 
"$(dirname "$0")/test_header" set_test_number 8 From f51794b326b3b0cc5bd284286d70a1bb6a9d6270 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 20 Jun 2025 09:20:19 +0100 Subject: [PATCH 049/101] Refactoring so that the jq command is not used --- cylc/flow/etc/job.sh | 4 +--- cylc/flow/scripts/profiler.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 178aafa0d89..af6f5aae16a 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -203,9 +203,7 @@ cylc__set_return() { # Save the data using cylc message and exit the profiler cylc__kill_profiler() { if [[ -f "profiler.json" ]]; then - max_rss="$(jq -r '.max_rss' profiler.json)" - cpu_time="$(jq -r '.cpu_time' profiler.json)" - cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: cpu_time $cpu_time max_rss $max_rss" || true + cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: $(cat profiler.json | tr -d '\n')" || true rm profiler.json fi if [[ -f "proc/${profiler_pid}" ]]; then diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index cde1e1f2763..38658f6a40c 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -163,7 +163,7 @@ def profile(process, version, delay, keep_looping=lambda: True): # Only save Max RSS to disk if it is above the previous value if memory > peak_memory: peak_memory = memory - write_data(str(peak_memory), cpu_time, "profiler.json") + write_data(peak_memory, cpu_time, "profiler.json") time.sleep(delay) From 27a08798b8cb60200e7d9aa3172f1f604057c9dd Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 20 Jun 2025 09:55:47 +0100 Subject: [PATCH 050/101] Shellchecker linting --- cylc/flow/etc/job.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index af6f5aae16a..abec239f163 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -203,7 +203,7 @@ 
cylc__set_return() { # Save the data using cylc message and exit the profiler cylc__kill_profiler() { if [[ -f "profiler.json" ]]; then - cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: $(cat profiler.json | tr -d '\n')" || true + cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: $(< profiler.json tr -d '\n')" || true rm profiler.json fi if [[ -f "proc/${profiler_pid}" ]]; then From 6efc7cfa0d0c051ebb4f55b094583ef444e936c1 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 8 Jul 2025 14:31:31 +0100 Subject: [PATCH 051/101] Linting --- cylc/flow/etc/job.sh | 15 +--- cylc/flow/job_file.py | 3 - cylc/flow/scripts/profiler.py | 89 ++++++++++++++----- .../{02-profiler => 03-profiler}/flow.cylc | 0 4 files changed, 72 insertions(+), 35 deletions(-) rename tests/functional/jobscript/{02-profiler => 03-profiler}/flow.cylc (100%) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index abec239f163..640567f4223 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -164,9 +164,7 @@ cylc__job__main() { fi } # Grab the max rss and cpu_time and clean up before changing directory - if [[ "${CYLC_PROFILE}" == "True" ]] ; then - cylc__kill_profiler - fi + cylc__kill_profiler # Empty work directory remove cd rmdir "${CYLC_TASK_WORK_DIR}" 2>'/dev/null' || true @@ -202,11 +200,7 @@ cylc__set_return() { ############################################################################### # Save the data using cylc message and exit the profiler cylc__kill_profiler() { - if [[ -f "profiler.json" ]]; then - cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: $(< profiler.json tr -d '\n')" || true - rm profiler.json - fi - if [[ -f "proc/${profiler_pid}" ]]; then + if [[ -d "/proc/${profiler_pid}" ]]; then kill -s SIGINT "${profiler_pid}" || true fi } @@ -293,9 +287,8 @@ cylc__job_finish_err() { # shellcheck disable=SC2086 trap '' ${CYLC_VACATION_SIGNALS:-} ${CYLC_FAIL_SIGNALS} - if [[ "${CYLC_PROFILE}" == "True" ]] ; then 
- cylc__kill_profiler - fi + cylc__kill_profiler + if [[ -n "${CYLC_TASK_MESSAGE_STARTED_PID:-}" ]]; then wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>'/dev/null' || true fi diff --git a/cylc/flow/job_file.py b/cylc/flow/job_file.py index d89006771c0..cc44aa42e03 100644 --- a/cylc/flow/job_file.py +++ b/cylc/flow/job_file.py @@ -231,9 +231,6 @@ def _write_task_environment(self, handle, job_conf): handle.write( "\n export CYLC_CGROUP=" f"{job_conf['platform']['profile']['cgroups path']}") - handle.write( - "\n export CYLC_CGROUP=" - f"{job_conf['platform']['profile']['cgroups path']}") handle.write( "\n export CYLC_POLLING_INTERVAL=" f"{job_conf['platform']['profile']['polling interval']}") diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 38658f6a40c..08bbb60d7c6 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -24,21 +24,27 @@ import re import sys import time -import json import signal +import asyncio from pathlib import Path from dataclasses import dataclass from cylc.flow.terminal import cli_function +from cylc.flow.network.client_factory import get_client from cylc.flow.option_parsers import CylcOptionParser as COP INTERNAL = True PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") RE_INT = re.compile(r'\d+') +max_rss_location = None +cpu_time_location = None +cgroup_version = None +comms_timeout = None def get_option_parser() -> COP: parser = COP( __doc__, + comms=True, argdoc=[ ], ) @@ -55,11 +61,14 @@ def get_option_parser() -> COP: @cli_function(get_option_parser) def main(parser: COP, options) -> None: """CLI main.""" + global comms_timeout # Register the stop_profiler function with the signal library signal.signal(signal.SIGINT, stop_profiler) signal.signal(signal.SIGHUP, stop_profiler) signal.signal(signal.SIGTERM, stop_profiler) + comms_timeout = options.comms_timeout + get_config(options) @@ -73,7 +82,49 @@ class Process: def stop_profiler(*args): """This function will be executed when the SIGINT 
signal is sent to this process""" - print('profiler exited') + + global max_rss_location + global cpu_time_location + global cgroup_version + global comms_timeout + + # If a task fails instantly, or finishes very quickly (< 1 second), + # the get config function doesn't have time to run + if (max_rss_location is None + or cpu_time_location is None + or cgroup_version is None): + max_rss = 0 + cpu_time = 0 + else: + max_rss = parse_memory_file(max_rss_location) + cpu_time = parse_cpu_file(cpu_time_location, cgroup_version) + + GRAPHQL_MUTATION = """ + mutation($WORKFLOWS: [WorkflowID]!, $MESSAGES: [[String]], $JOB: String!, $TIME: String) { + message(workflows: $WORKFLOWS, messages:$MESSAGES, taskJob:$JOB, eventTime:$TIME) { + result + } + } + """ + + GRAPHQL_REQUEST_VARIABLES = { + "WORKFLOWS": [os.environ.get('CYLC_WORKFLOW_ID')], + "MESSAGES": [["DEBUG", f"cpu_time {cpu_time} max_rss {max_rss}"]], + "JOB": os.environ.get('CYLC_TASK_JOB'), + "TIME": "now" + } + + pclient = get_client(os.environ.get('CYLC_WORKFLOW_ID'), + timeout=comms_timeout) + + async def send_cylc_message(): + await pclient.async_request( + 'graphql', + {'request_string': GRAPHQL_MUTATION, + 'variables': GRAPHQL_REQUEST_VARIABLES}, + ) + + asyncio.run(send_cylc_message()) sys.exit(0) @@ -100,21 +151,18 @@ def parse_cpu_file(cgroup_cpu_path, cgroup_version): return int(line) / 1000000 -def write_data(peak_memory, cpu_time, filename): - data = {'max_rss': peak_memory, - 'cpu_time': cpu_time} - with open(filename, 'w') as f: - json.dump(data, f, indent=4) - - def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: # HPC uses cgroups v2 and SPICE uses cgroups v1 + global cgroup_version if Path.exists(Path(cgroup_location + cgroup_name)): - return 1 + cgroup_version = 1 + return cgroup_version elif Path.exists(Path(cgroup_location + "/memory" + cgroup_name)): - return 2 + cgroup_version = 2 + return cgroup_version else: - raise FileNotFoundError("Cgroup not found") + raise 
FileNotFoundError("Cgroup not found at " + + cgroup_location + cgroup_name) def get_cgroup_name(): @@ -136,8 +184,11 @@ def get_cgroup_name(): def get_cgroup_paths(version, location, name): - + global max_rss_location + global cpu_time_location if version == 1: + max_rss_location = location + name + "/" + "memory.peak" + cpu_time_location = location + name + "/" + "cpu.stat" return Process( cgroup_memory_path=location + name + "/" + "memory.peak", @@ -145,6 +196,8 @@ def get_cgroup_paths(version, location, name): name + "/" + "cpu.stat") elif version == 2: + max_rss_location = location + "/memory" + name + "/memory.max_usage_in_bytes" + cpu_time_location = location + "/cpu" + name + "/cpuacct.usage" return Process( cgroup_memory_path=location + "/memory" + name + "/memory.max_usage_in_bytes", @@ -155,16 +208,10 @@ def get_cgroup_paths(version, location, name): def profile(process, version, delay, keep_looping=lambda: True): # The infinite loop that will constantly poll the cgroup # The lambda function is used to allow the loop to be stopped in unit tests - peak_memory = 0 + while keep_looping(): # Write cpu / memory usage data to disk - cpu_time = parse_cpu_file(process.cgroup_cpu_path, version) - memory = parse_memory_file(process.cgroup_memory_path) - # Only save Max RSS to disk if it is above the previous value - if memory > peak_memory: - peak_memory = memory - write_data(peak_memory, cpu_time, "profiler.json") - + # CPU_TIME = parse_cpu_file(process.cgroup_cpu_path, version) time.sleep(delay) diff --git a/tests/functional/jobscript/02-profiler/flow.cylc b/tests/functional/jobscript/03-profiler/flow.cylc similarity index 100% rename from tests/functional/jobscript/02-profiler/flow.cylc rename to tests/functional/jobscript/03-profiler/flow.cylc From 9a5a4fa1bd7db61cf54e52065a682393b57617e6 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 8 Jul 2025 14:31:31 +0100 Subject: [PATCH 052/101] Removing json usage Removing json usage --- 
cylc/flow/etc/job.sh | 15 +--- cylc/flow/job_file.py | 3 - cylc/flow/scripts/profiler.py | 89 ++++++++++++++----- .../{02-profiler => 03-profiler}/flow.cylc | 0 4 files changed, 72 insertions(+), 35 deletions(-) rename tests/functional/jobscript/{02-profiler => 03-profiler}/flow.cylc (100%) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index abec239f163..640567f4223 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -164,9 +164,7 @@ cylc__job__main() { fi } # Grab the max rss and cpu_time and clean up before changing directory - if [[ "${CYLC_PROFILE}" == "True" ]] ; then - cylc__kill_profiler - fi + cylc__kill_profiler # Empty work directory remove cd rmdir "${CYLC_TASK_WORK_DIR}" 2>'/dev/null' || true @@ -202,11 +200,7 @@ cylc__set_return() { ############################################################################### # Save the data using cylc message and exit the profiler cylc__kill_profiler() { - if [[ -f "profiler.json" ]]; then - cylc message -- "${CYLC_WORKFLOW_ID}" "${CYLC_TASK_JOB}" "DEBUG: $(< profiler.json tr -d '\n')" || true - rm profiler.json - fi - if [[ -f "proc/${profiler_pid}" ]]; then + if [[ -d "/proc/${profiler_pid}" ]]; then kill -s SIGINT "${profiler_pid}" || true fi } @@ -293,9 +287,8 @@ cylc__job_finish_err() { # shellcheck disable=SC2086 trap '' ${CYLC_VACATION_SIGNALS:-} ${CYLC_FAIL_SIGNALS} - if [[ "${CYLC_PROFILE}" == "True" ]] ; then - cylc__kill_profiler - fi + cylc__kill_profiler + if [[ -n "${CYLC_TASK_MESSAGE_STARTED_PID:-}" ]]; then wait "${CYLC_TASK_MESSAGE_STARTED_PID}" 2>'/dev/null' || true fi diff --git a/cylc/flow/job_file.py b/cylc/flow/job_file.py index d89006771c0..cc44aa42e03 100644 --- a/cylc/flow/job_file.py +++ b/cylc/flow/job_file.py @@ -231,9 +231,6 @@ def _write_task_environment(self, handle, job_conf): handle.write( "\n export CYLC_CGROUP=" f"{job_conf['platform']['profile']['cgroups path']}") - handle.write( - "\n export CYLC_CGROUP=" - f"{job_conf['platform']['profile']['cgroups 
path']}") handle.write( "\n export CYLC_POLLING_INTERVAL=" f"{job_conf['platform']['profile']['polling interval']}") diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 38658f6a40c..08bbb60d7c6 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -24,21 +24,27 @@ import re import sys import time -import json import signal +import asyncio from pathlib import Path from dataclasses import dataclass from cylc.flow.terminal import cli_function +from cylc.flow.network.client_factory import get_client from cylc.flow.option_parsers import CylcOptionParser as COP INTERNAL = True PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") RE_INT = re.compile(r'\d+') +max_rss_location = None +cpu_time_location = None +cgroup_version = None +comms_timeout = None def get_option_parser() -> COP: parser = COP( __doc__, + comms=True, argdoc=[ ], ) @@ -55,11 +61,14 @@ def get_option_parser() -> COP: @cli_function(get_option_parser) def main(parser: COP, options) -> None: """CLI main.""" + global comms_timeout # Register the stop_profiler function with the signal library signal.signal(signal.SIGINT, stop_profiler) signal.signal(signal.SIGHUP, stop_profiler) signal.signal(signal.SIGTERM, stop_profiler) + comms_timeout = options.comms_timeout + get_config(options) @@ -73,7 +82,49 @@ class Process: def stop_profiler(*args): """This function will be executed when the SIGINT signal is sent to this process""" - print('profiler exited') + + global max_rss_location + global cpu_time_location + global cgroup_version + global comms_timeout + + # If a task fails instantly, or finishes very quickly (< 1 second), + # the get config function doesn't have time to run + if (max_rss_location is None + or cpu_time_location is None + or cgroup_version is None): + max_rss = 0 + cpu_time = 0 + else: + max_rss = parse_memory_file(max_rss_location) + cpu_time = parse_cpu_file(cpu_time_location, cgroup_version) + + GRAPHQL_MUTATION = """ + mutation($WORKFLOWS: 
[WorkflowID]!, $MESSAGES: [[String]], $JOB: String!, $TIME: String) { + message(workflows: $WORKFLOWS, messages:$MESSAGES, taskJob:$JOB, eventTime:$TIME) { + result + } + } + """ + + GRAPHQL_REQUEST_VARIABLES = { + "WORKFLOWS": [os.environ.get('CYLC_WORKFLOW_ID')], + "MESSAGES": [["DEBUG", f"cpu_time {cpu_time} max_rss {max_rss}"]], + "JOB": os.environ.get('CYLC_TASK_JOB'), + "TIME": "now" + } + + pclient = get_client(os.environ.get('CYLC_WORKFLOW_ID'), + timeout=comms_timeout) + + async def send_cylc_message(): + await pclient.async_request( + 'graphql', + {'request_string': GRAPHQL_MUTATION, + 'variables': GRAPHQL_REQUEST_VARIABLES}, + ) + + asyncio.run(send_cylc_message()) sys.exit(0) @@ -100,21 +151,18 @@ def parse_cpu_file(cgroup_cpu_path, cgroup_version): return int(line) / 1000000 -def write_data(peak_memory, cpu_time, filename): - data = {'max_rss': peak_memory, - 'cpu_time': cpu_time} - with open(filename, 'w') as f: - json.dump(data, f, indent=4) - - def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: # HPC uses cgroups v2 and SPICE uses cgroups v1 + global cgroup_version if Path.exists(Path(cgroup_location + cgroup_name)): - return 1 + cgroup_version = 1 + return cgroup_version elif Path.exists(Path(cgroup_location + "/memory" + cgroup_name)): - return 2 + cgroup_version = 2 + return cgroup_version else: - raise FileNotFoundError("Cgroup not found") + raise FileNotFoundError("Cgroup not found at " + + cgroup_location + cgroup_name) def get_cgroup_name(): @@ -136,8 +184,11 @@ def get_cgroup_name(): def get_cgroup_paths(version, location, name): - + global max_rss_location + global cpu_time_location if version == 1: + max_rss_location = location + name + "/" + "memory.peak" + cpu_time_location = location + name + "/" + "cpu.stat" return Process( cgroup_memory_path=location + name + "/" + "memory.peak", @@ -145,6 +196,8 @@ def get_cgroup_paths(version, location, name): name + "/" + "cpu.stat") elif version == 2: + max_rss_location = 
location + "/memory" + name + "/memory.max_usage_in_bytes" + cpu_time_location = location + "/cpu" + name + "/cpuacct.usage" return Process( cgroup_memory_path=location + "/memory" + name + "/memory.max_usage_in_bytes", @@ -155,16 +208,10 @@ def get_cgroup_paths(version, location, name): def profile(process, version, delay, keep_looping=lambda: True): # The infinite loop that will constantly poll the cgroup # The lambda function is used to allow the loop to be stopped in unit tests - peak_memory = 0 + while keep_looping(): # Write cpu / memory usage data to disk - cpu_time = parse_cpu_file(process.cgroup_cpu_path, version) - memory = parse_memory_file(process.cgroup_memory_path) - # Only save Max RSS to disk if it is above the previous value - if memory > peak_memory: - peak_memory = memory - write_data(peak_memory, cpu_time, "profiler.json") - + # CPU_TIME = parse_cpu_file(process.cgroup_cpu_path, version) time.sleep(delay) diff --git a/tests/functional/jobscript/02-profiler/flow.cylc b/tests/functional/jobscript/03-profiler/flow.cylc similarity index 100% rename from tests/functional/jobscript/02-profiler/flow.cylc rename to tests/functional/jobscript/03-profiler/flow.cylc From 8f7c419f5eb017149731e0f33f4826e5358b8ce9 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 9 Jul 2025 09:43:45 +0100 Subject: [PATCH 053/101] Adding e2e functional tests --- cylc/flow/scripts/profiler.py | 5 ++ .../{02-profiler.t => 03-profiler.t} | 7 +- tests/functional/jobscript/04-profiler-e2e.t | 85 +++++++++++++++++++ .../jobscript/04-profiler-e2e/flow.cylc | 0 tests/unit/scripts/test_profiler.py | 84 +----------------- 5 files changed, 95 insertions(+), 86 deletions(-) rename tests/functional/jobscript/{02-profiler.t => 03-profiler.t} (95%) create mode 100644 tests/functional/jobscript/04-profiler-e2e.t create mode 100644 tests/functional/jobscript/04-profiler-e2e/flow.cylc diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 
08bbb60d7c6..192b31be97b 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -167,6 +167,11 @@ def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: def get_cgroup_name(): """Get the cgroup directory for the current process""" + + # fugly hack to allow functional tests to use test data + if 'profiler_test_env_var' in os.environ: + return os.getenv('profiler_test_env_var') + # Get the PID of the current process pid = os.getpid() try: diff --git a/tests/functional/jobscript/02-profiler.t b/tests/functional/jobscript/03-profiler.t similarity index 95% rename from tests/functional/jobscript/02-profiler.t rename to tests/functional/jobscript/03-profiler.t index 44391be4ad5..2a910e6705c 100644 --- a/tests/functional/jobscript/02-profiler.t +++ b/tests/functional/jobscript/03-profiler.t @@ -28,11 +28,12 @@ create_test_global_config " [[${CYLC_TEST_PLATFORM}]] [[[profile]]] activate = True - polling interval = 1 + polling interval = 10 [[localhost]] [[[profile]]] activate = True - cgroups path = /no/such/path + polling interval = 10 + cgroups path = the/thing/that/should/not/be " init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' @@ -54,7 +55,7 @@ init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' [[the_ugly]] # this task should succeed despite the broken profiler configuration platform = localhost - script = sleep 1 + script = sleep 5 __FLOW_CONFIG__ run_ok "${TEST_NAME_BASE}-validate" cylc validate "${WORKFLOW_NAME}" diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t new file mode 100644 index 00000000000..9230af2ae7e --- /dev/null +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -0,0 +1,85 @@ +#!/usr/bin/env bash +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +#------------------------------------------------------------------------------- +# Cylc profile test +# NOTE: This test will run the Cylc profiler on the given test platform. +# The test platform may need to be configured for this to work (e.g. +# "cgroups path" may need to be set). + +. "$(dirname "$0")/test_header" +set_test_number 7 + +mkdir -p "${PWD}/cgroups_test_data" + +echo '12345678' > cgroups_test_data/memory.peak +mem_path="${PWD}/cgroups_test_data/memory.peak" + +echo 'usage_usec 56781234' > cgroups_test_data/cpu.stat +cpu_path="${PWD}/cgroups_test_data/cpu.stat" + +export profiler_test_env_var='/cgroups_test_data' +create_test_global_config " +[platforms] + [[localhost]] + [[[profile]]] + activate = True + cgroups path = ${PWD} +" + +init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' +#!Jinja2 + +[scheduling] + [[graph]] + R1 = the_good & the_bad? 
& the_ugly + +[runtime] + [[the_good]] + # this task should succeeded normally + platform = localhost + script = sleep 1 + [[the_bad]] + # this task should fail (it should still send profiling info) + platform = localhost + script = sleep 5; false + [[the_ugly]] + # this task should succeed despite the broken profiler configuration + platform = localhost + script = sleep 1 +__FLOW_CONFIG__ + +run_ok "${TEST_NAME_BASE}-validate" cylc validate "${WORKFLOW_NAME}" +workflow_run_ok "${TEST_NAME_BASE}-run" cylc play --debug --no-detach "${WORKFLOW_NAME}" + +# ensure the cpu and memory messages were received and that these messages +# were received before the succeeded message +log_scan "${TEST_NAME_BASE}-task-succeeded" \ + "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ + '1/the_good.*(received)cpu_time.*max_rss*' \ + '1/the_good.*(received)succeeded' + +# ensure the cpu and memory messages were received and that these messages +# were received before the failed message +log_scan "${TEST_NAME_BASE}-task-succeeded" \ + "${WORKFLOW_RUN_DIR}/log/scheduler/log" 1 0 \ + '1/the_bad.*(received)cpu_time.*max_rss*' \ + '1/the_bad.*failed' + +# ensure this task succeeded despite the broken profiler configuration +grep_workflow_log_ok "${TEST_NAME_BASE}-broken" '1/the_ugly.*(received)succeeded' + +purge diff --git a/tests/functional/jobscript/04-profiler-e2e/flow.cylc b/tests/functional/jobscript/04-profiler-e2e/flow.cylc new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 1d66cbd53bc..a4a6816a683 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -17,7 +17,6 @@ # Tests for functions contained in cylc.flow.scripts.profiler from cylc.flow.scripts.profiler import (parse_memory_file, parse_cpu_file, - write_data, get_cgroup_name, get_cgroup_version, get_cgroup_paths, @@ -63,15 +62,6 @@ def test_parse_cpu_file(mocker): 
mock_file.assert_called_once_with("mocked_file.txt", "r") -def test_write_data(tmpdir): - # Create tmp file - file = tmpdir.join('output.txt') - - write_data('test_memory', 'test_cpu', file.strpath) - assert file.read() == ('{\n "max_rss": "test_memory",\n' - ' "cpu_time": "test_cpu"\n}') - - def test_get_cgroup_name(mocker): mock_file = mocker.mock_open(read_data="0::bad/test/cgroup/place") @@ -128,6 +118,7 @@ def test_get_cgroup_paths(): def test_profile_data(mocker): + # This test should run without error process = get_cgroup_paths(1, "test_location/", "test_name") @@ -139,76 +130,3 @@ def test_profile_data(mocker): return_value=2048) run_once = mock.Mock(side_effect=[True, False]) profile(process, 1, 1, run_once) - mock_file.assert_called_with("profiler.json", "w") - - -def test_stop_profiler(): - with pytest.raises(SystemExit) as pytest_wrapped_e: - stop_profiler() - assert pytest_wrapped_e.type == SystemExit - assert pytest_wrapped_e.value.code == 0 - - -def test_profile_1(mocker): - process = get_cgroup_paths( - 1, "test_location/", "test_name") - - mock_file = mocker.mock_open(read_data="") - mocker.patch("builtins.open", mock_file) - mocker.patch( - "cylc.flow.scripts.profiler.parse_memory_file", return_value=1024) - mocker.patch( - "cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) - run_once = mock.Mock(side_effect=[True, False]) - - profile(process, 1, 1, run_once) - mock_file.assert_called_with("profiler.json", "w") - - -def test_profile_2(mocker): - # assert_called_with only shows the last call to open(). 
- # Setting peak memory to zero stops the memory call to open - process = get_cgroup_paths( - 1, "test_location/", "test_name") - - mock_file = mocker.mock_open(read_data="") - mocker.patch("builtins.open", mock_file) - mocker.patch( - "cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) - mocker.patch( - "cylc.flow.scripts.profiler.parse_memory_file", return_value=0) - run_once = mock.Mock(side_effect=[True, False]) - - profile(process, 1, 1, run_once) - mock_file.assert_called_with("profiler.json", "w") - - -def test_get_config(mocker): - - # Mock the 'open' function call to return a valid string. - mock_file = mocker.mock_open(read_data="0::good/cgroup/place/2222222") - mocker.patch("builtins.open", mock_file) - - # Mock the get_cgroup_version function so it says the cgroup path is valid - mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", - return_value=1) - # Mock the parse functions so they return valid values - mocker.patch("cylc.flow.scripts.profiler.parse_memory_file", - return_value=1024) - mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", - return_value=2048) - - # Mock the write_data function to simulate writing data. 
- # It will error out on the 3rd call - mock_write = mock.Mock( - side_effect=[None, None, FileNotFoundError('Carpe Diem')]) - mocker.patch("cylc.flow.scripts.profiler.write_data", mock_write) - - parser = argparse.ArgumentParser() - parser.add_argument( - "-i", type=int, default=10, dest="delay") - parser.add_argument( - "-m", type=str, default="test_location/", - dest="cgroup_location") - with pytest.raises(FileNotFoundError): - get_config(parser.parse_args()) From 6f8c3f3209955276ee449ab1be8fabfb8334f86c Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 9 Jul 2025 10:23:17 +0100 Subject: [PATCH 054/101] Linting --- cylc/flow/scripts/profiler.py | 17 ++++++++--------- tests/unit/scripts/test_profiler.py | 3 --- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 192b31be97b..61eaaf6d8e9 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -83,11 +83,6 @@ def stop_profiler(*args): """This function will be executed when the SIGINT signal is sent to this process""" - global max_rss_location - global cpu_time_location - global cgroup_version - global comms_timeout - # If a task fails instantly, or finishes very quickly (< 1 second), # the get config function doesn't have time to run if (max_rss_location is None @@ -100,8 +95,10 @@ def stop_profiler(*args): cpu_time = parse_cpu_file(cpu_time_location, cgroup_version) GRAPHQL_MUTATION = """ - mutation($WORKFLOWS: [WorkflowID]!, $MESSAGES: [[String]], $JOB: String!, $TIME: String) { - message(workflows: $WORKFLOWS, messages:$MESSAGES, taskJob:$JOB, eventTime:$TIME) { + mutation($WORKFLOWS: [WorkflowID]!, + $MESSAGES: [[String]], $JOB: String!, $TIME: String) { + message(workflows: $WORKFLOWS, messages:$MESSAGES, + taskJob:$JOB, eventTime:$TIME) { result } } @@ -201,8 +198,10 @@ def get_cgroup_paths(version, location, name): name + "/" + "cpu.stat") elif version == 2: - max_rss_location = location + 
"/memory" + name + "/memory.max_usage_in_bytes" - cpu_time_location = location + "/cpu" + name + "/cpuacct.usage" + max_rss_location = (location + "/memory" + + name + "/memory.max_usage_in_bytes") + cpu_time_location = (location + "/cpu" + + name + "/cpuacct.usage") return Process( cgroup_memory_path=location + "/memory" + name + "/memory.max_usage_in_bytes", diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index a4a6816a683..2903580a9f5 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -20,11 +20,8 @@ get_cgroup_name, get_cgroup_version, get_cgroup_paths, - get_config, - stop_profiler, profile) import pytest -import argparse from unittest import mock From 2e022866833a54590f2ec1dc860e43eab95c31ec Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 9 Jul 2025 10:24:53 +0100 Subject: [PATCH 055/101] Linting --- cylc/flow/scripts/profiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 61eaaf6d8e9..84547d01793 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -95,9 +95,9 @@ def stop_profiler(*args): cpu_time = parse_cpu_file(cpu_time_location, cgroup_version) GRAPHQL_MUTATION = """ - mutation($WORKFLOWS: [WorkflowID]!, + mutation($WORKFLOWS: [WorkflowID]!, $MESSAGES: [[String]], $JOB: String!, $TIME: String) { - message(workflows: $WORKFLOWS, messages:$MESSAGES, + message(workflows: $WORKFLOWS, messages:$MESSAGES, taskJob:$JOB, eventTime:$TIME) { result } From 2923917fe556c4d4b52a6dce153199b7a9982410 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 9 Jul 2025 10:35:12 +0100 Subject: [PATCH 056/101] Fixing functional tests --- tests/functional/jobscript/04-profiler-e2e.t | 3 --- tests/unit/test_job_file.py | 1 - 2 files changed, 4 deletions(-) diff --git a/tests/functional/jobscript/04-profiler-e2e.t 
b/tests/functional/jobscript/04-profiler-e2e.t index 9230af2ae7e..e7977c8b0ff 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -26,10 +26,7 @@ set_test_number 7 mkdir -p "${PWD}/cgroups_test_data" echo '12345678' > cgroups_test_data/memory.peak -mem_path="${PWD}/cgroups_test_data/memory.peak" - echo 'usage_usec 56781234' > cgroups_test_data/cpu.stat -cpu_path="${PWD}/cgroups_test_data/cpu.stat" export profiler_test_env_var='/cgroups_test_data' create_test_global_config " diff --git a/tests/unit/test_job_file.py b/tests/unit/test_job_file.py index 386869b5c57..3ccd8d2923c 100644 --- a/tests/unit/test_job_file.py +++ b/tests/unit/test_job_file.py @@ -401,7 +401,6 @@ def test_write_task_environment(): 'CYLC_TASK_FLOW_NUMBERS=1\n export ' 'CYLC_PROFILE=true\n export ' 'CYLC_CGROUP=exit_light\n export ' - 'CYLC_CGROUP=exit_light\n export ' 'CYLC_POLLING_INTERVAL=1\n export ' 'CYLC_TASK_PARAM_duck="quack"\n export ' 'CYLC_TASK_PARAM_mouse="squeak"\n ' From 15c29fa9f15240346b013489f1e9e7b2ce4c5500 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 9 Jul 2025 13:25:51 +0100 Subject: [PATCH 057/101] Kill profiler more reliably --- cylc/flow/etc/job.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 640567f4223..3cf7851eb73 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -200,7 +200,7 @@ cylc__set_return() { ############################################################################### # Save the data using cylc message and exit the profiler cylc__kill_profiler() { - if [[ -d "/proc/${profiler_pid}" ]]; then + if [[ -n "${profiler_pid:-}" && -d "/proc/${profiler_pid}" ]]; then kill -s SIGINT "${profiler_pid}" || true fi } From c09aa6f0863f1d6e74b8fc7e3d62816161fdcdff Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 10 Jul 2025 13:21:29 +0100 Subject: [PATCH 058/101] Added unit test coverage --- 
tests/functional/jobscript/04-profiler-e2e.t | 2 +- tests/unit/scripts/test_profiler.py | 26 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t index e7977c8b0ff..dda04ffc1a0 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -26,7 +26,7 @@ set_test_number 7 mkdir -p "${PWD}/cgroups_test_data" echo '12345678' > cgroups_test_data/memory.peak -echo 'usage_usec 56781234' > cgroups_test_data/cpu.stat +echo "blah blah 123456\nusage_usec 56781234" > cgroups_test_data/cpu.stat export profiler_test_env_var='/cgroups_test_data' create_test_global_config " diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 2903580a9f5..cec4e979b5a 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -20,11 +20,37 @@ get_cgroup_name, get_cgroup_version, get_cgroup_paths, + stop_profiler, profile) import pytest from unittest import mock +def test_stop_profiler(mocker, monkeypatch): + monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") + + def mock_get_client(env_var, timeout=None): + return True + + class MockedClient(): + def __init__(self, *a, **k): pass + + async def async_request(self, *a, **k): pass + + mocker.patch("cylc.flow.scripts.profiler.get_client", MockedClient) + max_rss_location = None + cpu_time_location = None + cgroup_version = 1 + + with pytest.raises(SystemExit) as excinfo: + stop_profiler(max_rss_location, cpu_time_location, cgroup_version) + assert stop_profiler.max_rss == 0 + assert stop_profiler.cpu_time == 0 + + assert excinfo.type == SystemExit + assert excinfo.value.code == 0 + + def test_parse_memory_file(mocker): with pytest.raises(FileNotFoundError): From a29cf82e77c7918b93d228c1b84911d9ba0dbb39 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 10 Jul 2025 13:23:31 +0100 Subject: [PATCH 
059/101] linting --- tests/unit/scripts/test_profiler.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index cec4e979b5a..f4e7af6beff 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -33,9 +33,11 @@ def mock_get_client(env_var, timeout=None): return True class MockedClient(): - def __init__(self, *a, **k): pass + def __init__(self, *a, **k): + pass - async def async_request(self, *a, **k): pass + async def async_request(self, *a, **k): + pass mocker.patch("cylc.flow.scripts.profiler.get_client", MockedClient) max_rss_location = None From 172b45371efeee8586f4f2fabf58d4e23d7c5a75 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 10 Jul 2025 13:28:52 +0100 Subject: [PATCH 060/101] linting --- tests/functional/jobscript/04-profiler-e2e.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t index dda04ffc1a0..ec48664453b 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -26,7 +26,7 @@ set_test_number 7 mkdir -p "${PWD}/cgroups_test_data" echo '12345678' > cgroups_test_data/memory.peak -echo "blah blah 123456\nusage_usec 56781234" > cgroups_test_data/cpu.stat +printf "blah blah 123456\nusage_usec 56781234" > cgroups_test_data/cpu.stat export profiler_test_env_var='/cgroups_test_data' create_test_global_config " From 59200fc896044ef3304392a41faabb7187f36ab3 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 21 Jul 2025 10:11:40 +0100 Subject: [PATCH 061/101] Corrected cgroup versions --- cylc/flow/scripts/profiler.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 84547d01793..e6a1fc1a7fa 100755 --- a/cylc/flow/scripts/profiler.py +++ 
b/cylc/flow/scripts/profiler.py @@ -136,12 +136,12 @@ def parse_memory_file(cgroup_memory_path): def parse_cpu_file(cgroup_cpu_path, cgroup_version): """Open the memory stat file and return the appropriate data""" - if cgroup_version == 1: + if cgroup_version == 2: with open(cgroup_cpu_path, 'r') as f: for line in f: if "usage_usec" in line: return int(RE_INT.findall(line)[0]) // 1000 - elif cgroup_version == 2: + elif cgroup_version == 1: with open(cgroup_cpu_path, 'r') as f: for line in f: # Cgroups v2 uses nanoseconds @@ -152,10 +152,10 @@ def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: # HPC uses cgroups v2 and SPICE uses cgroups v1 global cgroup_version if Path.exists(Path(cgroup_location + cgroup_name)): - cgroup_version = 1 + cgroup_version = 2 return cgroup_version elif Path.exists(Path(cgroup_location + "/memory" + cgroup_name)): - cgroup_version = 2 + cgroup_version = 1 return cgroup_version else: raise FileNotFoundError("Cgroup not found at " + @@ -188,7 +188,7 @@ def get_cgroup_name(): def get_cgroup_paths(version, location, name): global max_rss_location global cpu_time_location - if version == 1: + if version == 2: max_rss_location = location + name + "/" + "memory.peak" cpu_time_location = location + name + "/" + "cpu.stat" return Process( @@ -197,7 +197,7 @@ def get_cgroup_paths(version, location, name): cgroup_cpu_path=location + name + "/" + "cpu.stat") - elif version == 2: + elif version == 1: max_rss_location = (location + "/memory" + name + "/memory.max_usage_in_bytes") cpu_time_location = (location + "/cpu" + From 9189ac355413b2b0741039c8cb337de44c746343 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 25 Jul 2025 11:50:00 +0100 Subject: [PATCH 062/101] Updating unit tests --- tests/unit/scripts/test_profiler.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index f4e7af6beff..2bc06dc8d2a 100644 --- 
a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -71,7 +71,7 @@ def test_parse_memory_file(mocker): def test_parse_cpu_file(mocker): with pytest.raises(FileNotFoundError): - parse_cpu_file("non_existent_file.txt", 1) + parse_cpu_file("non_existent_file.txt", 2) # Mock the 'open' function call to return a file object. mock_file = mocker.mock_open(read_data="usage_usec 1000000") @@ -83,7 +83,7 @@ def test_parse_cpu_file(mocker): mock_file = mocker.mock_open(read_data="1000000") mocker.patch("builtins.open", mock_file) - assert parse_cpu_file("mocked_file.txt", 2) == 1 + assert parse_cpu_file("mocked_file.txt", 1) == 1 mock_file.assert_called_once_with("mocked_file.txt", "r") @@ -114,11 +114,11 @@ def test_get_cgroup_version(mocker): # Mock the Path.exists function call to return True mocker.patch("pathlib.Path.exists", return_value=True) assert get_cgroup_version('stuff/in/place', - 'more_stuff') == 1 + 'more_stuff') == 2 with mock.patch('pathlib.Path.exists', side_effect=[False, True]): assert get_cgroup_version('stuff/in/place', - 'more_stuff') == 2 + 'more_stuff') == 1 # Mock the Path.exists function call to return False mocker.patch("pathlib.Path.exists", return_value=False) @@ -129,12 +129,12 @@ def test_get_cgroup_version(mocker): def test_get_cgroup_paths(): - process = get_cgroup_paths(1, "test_location/", + process = get_cgroup_paths(2, "test_location/", "test_name") assert process.cgroup_memory_path == "test_location/test_name/memory.peak" assert process.cgroup_cpu_path == "test_location/test_name/cpu.stat" - process = get_cgroup_paths(2, "test_location", + process = get_cgroup_paths(1, "test_location", "/test_name") assert (process.cgroup_memory_path == "test_location/memory/test_name/memory.max_usage_in_bytes") From 31f2c1805f12ba25a473d8910923ee07de6b926d Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 25 Jul 2025 11:53:56 +0100 Subject: [PATCH 063/101] Updating unit tests --- 
tests/unit/scripts/test_profiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 2bc06dc8d2a..1e69981e954 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -74,14 +74,14 @@ def test_parse_cpu_file(mocker): parse_cpu_file("non_existent_file.txt", 2) # Mock the 'open' function call to return a file object. - mock_file = mocker.mock_open(read_data="usage_usec 1000000") + mock_file = mocker.mock_open(read_data="1000000") mocker.patch("builtins.open", mock_file) assert parse_cpu_file( "mocked_file.txt", 1) == 1000 mock_file.assert_called_once_with("mocked_file.txt", "r") - mock_file = mocker.mock_open(read_data="1000000") + mock_file = mocker.mock_open(read_data="usage_usec 1000000") mocker.patch("builtins.open", mock_file) assert parse_cpu_file("mocked_file.txt", 1) == 1 mock_file.assert_called_once_with("mocked_file.txt", "r") From 038348aaae1402459180ed5abf00ded8d4d84672 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 28 Jul 2025 16:07:31 +0100 Subject: [PATCH 064/101] Updating unit tests --- tests/unit/scripts/test_profiler.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 1e69981e954..886e2987bed 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -76,14 +76,13 @@ def test_parse_cpu_file(mocker): # Mock the 'open' function call to return a file object. 
mock_file = mocker.mock_open(read_data="1000000") mocker.patch("builtins.open", mock_file) - - assert parse_cpu_file( - "mocked_file.txt", 1) == 1000 + assert parse_cpu_file("mocked_file.txt", 1) == 1 mock_file.assert_called_once_with("mocked_file.txt", "r") mock_file = mocker.mock_open(read_data="usage_usec 1000000") mocker.patch("builtins.open", mock_file) - assert parse_cpu_file("mocked_file.txt", 1) == 1 + assert parse_cpu_file( + "mocked_file.txt", 2) == 1000 mock_file.assert_called_once_with("mocked_file.txt", "r") From 2975ea4fc637c49beb7d8977651cf97217eb9782 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 29 Jul 2025 16:01:34 +0100 Subject: [PATCH 065/101] Updating unit tests --- tests/functional/jobscript/04-profiler-e2e.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t index ec48664453b..4f676cc65b5 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -48,7 +48,7 @@ init_workflow "${TEST_NAME_BASE}" <<'__FLOW_CONFIG__' [[the_good]] # this task should succeeded normally platform = localhost - script = sleep 1 + script = sleep 5 [[the_bad]] # this task should fail (it should still send profiling info) platform = localhost From 5e994fbf7f4adc7f0036be46f67bd1ae1efa00bd Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 29 Jul 2025 16:27:37 +0100 Subject: [PATCH 066/101] updating .mailmap --- .mailmap | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index c4e4f6a2525..20084d2b73a 100644 --- a/.mailmap +++ b/.mailmap @@ -58,4 +58,4 @@ github-actions[bot] github-actions[bot] GitHub Action Diquan Jabbour <165976689+Diquan-BOM@users.noreply.github.com> Maxime Rio -Christopher Bennett christopher.bennett +Christopher Bennett ChrisPaulBennett From 5c8585eefe45e8d1938570548e4e748a6f05e6c8 Mon Sep 17 00:00:00 2001 From: 
"christopher.bennett" Date: Fri, 1 Aug 2025 08:53:44 +0100 Subject: [PATCH 067/101] Updating .mailmap --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 20084d2b73a..5f1a688518b 100644 --- a/.mailmap +++ b/.mailmap @@ -59,3 +59,4 @@ github-actions[bot] GitHub Action Diquan Jabbour <165976689+Diquan-BOM@users.noreply.github.com> Maxime Rio Christopher Bennett ChrisPaulBennett +Christopher Bennett christopher.bennett \ No newline at end of file From 0c68cecd5c8a1ea7488b9b35ffe3f7d0ca03216e Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 4 Aug 2025 16:02:03 +0100 Subject: [PATCH 068/101] testing macos --- cylc/flow/etc/job.sh | 1 + cylc/flow/scripts/profiler.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 3cf7851eb73..ed91990f570 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -202,6 +202,7 @@ cylc__set_return() { cylc__kill_profiler() { if [[ -n "${profiler_pid:-}" && -d "/proc/${profiler_pid}" ]]; then kill -s SIGINT "${profiler_pid}" || true + timeout 30 wait "${profiler_pid}" || echo "WARNING: profiler did not exit in time" fi } diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index e6a1fc1a7fa..2d533aceed9 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -82,7 +82,7 @@ class Process: def stop_profiler(*args): """This function will be executed when the SIGINT signal is sent to this process""" - + print("received signal to stop profiler") # If a task fails instantly, or finishes very quickly (< 1 second), # the get config function doesn't have time to run if (max_rss_location is None @@ -115,11 +115,13 @@ def stop_profiler(*args): timeout=comms_timeout) async def send_cylc_message(): + print("before message is sent") await pclient.async_request( 'graphql', {'request_string': GRAPHQL_MUTATION, 'variables': GRAPHQL_REQUEST_VARIABLES}, ) + print("After message is 
sent") asyncio.run(send_cylc_message()) sys.exit(0) @@ -223,6 +225,7 @@ def get_config(args): # Find the cgroup that this process is running in. # Cylc will put this profiler in the same cgroup # as the job it is profiling + print("profiler started") cgroup_name = get_cgroup_name() cgroup_version = get_cgroup_version(args.cgroup_location, cgroup_name) process = get_cgroup_paths(cgroup_version, From 7eddcf2fba2b5c37b9d939c32a3e658094bf8c75 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 5 Aug 2025 14:06:31 +0100 Subject: [PATCH 069/101] testing macos --- cylc/flow/scripts/profiler.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 2d533aceed9..edb16c0f8d5 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -225,13 +225,18 @@ def get_config(args): # Find the cgroup that this process is running in. # Cylc will put this profiler in the same cgroup # as the job it is profiling - print("profiler started") + print("Getting cgroup name") cgroup_name = get_cgroup_name() + print("Cgroup name is:", cgroup_name) + print("Getting cgroup version") cgroup_version = get_cgroup_version(args.cgroup_location, cgroup_name) + print("Cgroup version is:", cgroup_version) + print("Getting cgroup paths") process = get_cgroup_paths(cgroup_version, args.cgroup_location, cgroup_name) - + print("Cgroup paths are:", process) + print("Starting profiler") profile(process, cgroup_version, args.delay) From 2cd9991a4f65f629ea2e45b62630a5943b0a2c8d Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 5 Aug 2025 14:32:28 +0100 Subject: [PATCH 070/101] testing macos --- cylc/flow/etc/job.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index ed91990f570..d9d5d675220 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -200,8 +200,11 @@ cylc__set_return() { 
############################################################################### # Save the data using cylc message and exit the profiler cylc__kill_profiler() { + echo "Killing profiler" if [[ -n "${profiler_pid:-}" && -d "/proc/${profiler_pid}" ]]; then + echo "Sending SIGINT to profiler pid ${profiler_pid}" kill -s SIGINT "${profiler_pid}" || true + echo "Waiting for profiler to exit" timeout 30 wait "${profiler_pid}" || echo "WARNING: profiler did not exit in time" fi } From c1a5686950dd7fe296ed6d5a50cc644b340b18c0 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 5 Aug 2025 14:53:01 +0100 Subject: [PATCH 071/101] testing macos --- cylc/flow/etc/job.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index d9d5d675220..eac9377be66 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -201,7 +201,9 @@ cylc__set_return() { # Save the data using cylc message and exit the profiler cylc__kill_profiler() { echo "Killing profiler" - if [[ -n "${profiler_pid:-}" && -d "/proc/${profiler_pid}" ]]; then + echo "${profiler_pid}" + ls /proc/ + if [[ -n "${profiler_pid:-}" && $(ps -p "${profiler_pid}" -o pid=) ]]; then echo "Sending SIGINT to profiler pid ${profiler_pid}" kill -s SIGINT "${profiler_pid}" || true echo "Waiting for profiler to exit" From 2bc9ec600af7a9bb60665916edf30032e423a986 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 5 Aug 2025 15:15:55 +0100 Subject: [PATCH 072/101] testing macos --- cylc/flow/etc/job.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index eac9377be66..7e6b7254b8d 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -207,7 +207,6 @@ cylc__kill_profiler() { echo "Sending SIGINT to profiler pid ${profiler_pid}" kill -s SIGINT "${profiler_pid}" || true echo "Waiting for profiler to exit" - timeout 30 wait "${profiler_pid}" || echo "WARNING: profiler did not exit in time" fi } From 
8ca760d1cdb53e90c2222cc4a357f45801ae76a1 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 5 Aug 2025 15:39:06 +0100 Subject: [PATCH 073/101] testing macos --- cylc/flow/etc/job.sh | 7 +------ cylc/flow/scripts/profiler.py | 10 ---------- tests/functional/jobscript/04-profiler-e2e.t | 4 ++++ 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 7e6b7254b8d..3cf7851eb73 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -200,13 +200,8 @@ cylc__set_return() { ############################################################################### # Save the data using cylc message and exit the profiler cylc__kill_profiler() { - echo "Killing profiler" - echo "${profiler_pid}" - ls /proc/ - if [[ -n "${profiler_pid:-}" && $(ps -p "${profiler_pid}" -o pid=) ]]; then - echo "Sending SIGINT to profiler pid ${profiler_pid}" + if [[ -n "${profiler_pid:-}" && -d "/proc/${profiler_pid}" ]]; then kill -s SIGINT "${profiler_pid}" || true - echo "Waiting for profiler to exit" fi } diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index edb16c0f8d5..3290f132324 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -82,7 +82,6 @@ class Process: def stop_profiler(*args): """This function will be executed when the SIGINT signal is sent to this process""" - print("received signal to stop profiler") # If a task fails instantly, or finishes very quickly (< 1 second), # the get config function doesn't have time to run if (max_rss_location is None @@ -115,13 +114,11 @@ def stop_profiler(*args): timeout=comms_timeout) async def send_cylc_message(): - print("before message is sent") await pclient.async_request( 'graphql', {'request_string': GRAPHQL_MUTATION, 'variables': GRAPHQL_REQUEST_VARIABLES}, ) - print("After message is sent") asyncio.run(send_cylc_message()) sys.exit(0) @@ -225,18 +222,11 @@ def get_config(args): # Find the cgroup that this process is 
running in. # Cylc will put this profiler in the same cgroup # as the job it is profiling - print("Getting cgroup name") cgroup_name = get_cgroup_name() - print("Cgroup name is:", cgroup_name) - print("Getting cgroup version") cgroup_version = get_cgroup_version(args.cgroup_location, cgroup_name) - print("Cgroup version is:", cgroup_version) - print("Getting cgroup paths") process = get_cgroup_paths(cgroup_version, args.cgroup_location, cgroup_name) - print("Cgroup paths are:", process) - print("Starting profiler") profile(process, cgroup_version, args.delay) diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t index 4f676cc65b5..c9e8bc2a2aa 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -20,6 +20,10 @@ # The test platform may need to be configured for this to work (e.g. # "cgroups path" may need to be set). +if [[ "$OSTYPE" == "darwin"* ]]; then + skip_all "Test not compatible with Mac OS" +fi + . "$(dirname "$0")/test_header" set_test_number 7 From ebbb8f087407f15c4438dd45917dab9b5df1a2de Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Tue, 5 Aug 2025 16:12:12 +0100 Subject: [PATCH 074/101] testing macos --- tests/functional/jobscript/04-profiler-e2e.t | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t index c9e8bc2a2aa..9e9c08b76db 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -22,7 +22,7 @@ if [[ "$OSTYPE" == "darwin"* ]]; then skip_all "Test not compatible with Mac OS" -fi +else . 
"$(dirname "$0")/test_header" set_test_number 7 @@ -84,3 +84,5 @@ log_scan "${TEST_NAME_BASE}-task-succeeded" \ grep_workflow_log_ok "${TEST_NAME_BASE}-broken" '1/the_ugly.*(received)succeeded' purge + +fi \ No newline at end of file From 6f65ebd5502304c3a681bc913d328d1dd33bb347 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 6 Aug 2025 09:12:45 +0100 Subject: [PATCH 075/101] testing macos --- tests/functional/jobscript/04-profiler-e2e.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t index 9e9c08b76db..2a9d93d2352 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -20,7 +20,7 @@ # The test platform may need to be configured for this to work (e.g. # "cgroups path" may need to be set). -if [[ "$OSTYPE" == "darwin"* ]]; then +if [[ "$OSTYPE" == "darwin*" ]]; then skip_all "Test not compatible with Mac OS" else From 54553c1faa3a9a60d501d816c217e63f2375ad59 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 6 Aug 2025 10:16:24 +0100 Subject: [PATCH 076/101] testing macos --- tests/functional/jobscript/04-profiler-e2e.t | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t index 2a9d93d2352..6825e532096 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -20,7 +20,10 @@ # The test platform may need to be configured for this to work (e.g. # "cgroups path" may need to be set). 
+echo 'testing OS' +echo "$OSTYPE" if [[ "$OSTYPE" == "darwin*" ]]; then + echo "Skipping test on Mac OS" skip_all "Test not compatible with Mac OS" else From 556f3ba1abdd1d53b24b2e3304bf8ea640aa92c3 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 6 Aug 2025 11:00:46 +0100 Subject: [PATCH 077/101] testing macos --- tests/functional/jobscript/04-profiler-e2e.t | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t index 6825e532096..94ccb16d94f 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -20,14 +20,12 @@ # The test platform may need to be configured for this to work (e.g. # "cgroups path" may need to be set). -echo 'testing OS' -echo "$OSTYPE" -if [[ "$OSTYPE" == "darwin*" ]]; then - echo "Skipping test on Mac OS" - skip_all "Test not compatible with Mac OS" -else - . "$(dirname "$0")/test_header" + +if [[ "$OSTYPE" != "darwin"* ]]; then + skip_all "Tests not compatibile with $OSTYPE" +fi + set_test_number 7 mkdir -p "${PWD}/cgroups_test_data" @@ -87,5 +85,3 @@ log_scan "${TEST_NAME_BASE}-task-succeeded" \ grep_workflow_log_ok "${TEST_NAME_BASE}-broken" '1/the_ugly.*(received)succeeded' purge - -fi \ No newline at end of file From f70c1d33455052408916bcfee167357e71b697fd Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 6 Aug 2025 11:33:05 +0100 Subject: [PATCH 078/101] testing macos --- tests/functional/jobscript/04-profiler-e2e.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t index 94ccb16d94f..053ba348e82 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -22,7 +22,7 @@ . 
"$(dirname "$0")/test_header" -if [[ "$OSTYPE" != "darwin"* ]]; then +if [[ "$OSTYPE" != "linux-gnu"* ]]; then skip_all "Tests not compatibile with $OSTYPE" fi From 1e93434c4d6b66241c26eb4dab87785c6cec870b Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 20 Oct 2025 15:11:08 +0100 Subject: [PATCH 079/101] change how profiler is killed to be POSIX compliant --- cylc/flow/etc/job.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/etc/job.sh b/cylc/flow/etc/job.sh index 3cf7851eb73..16849e9f4b2 100755 --- a/cylc/flow/etc/job.sh +++ b/cylc/flow/etc/job.sh @@ -200,7 +200,7 @@ cylc__set_return() { ############################################################################### # Save the data using cylc message and exit the profiler cylc__kill_profiler() { - if [[ -n "${profiler_pid:-}" && -d "/proc/${profiler_pid}" ]]; then + if [[ -n "${profiler_pid:-}" ]] && ps -p "$profiler_pid" > /dev/null; then kill -s SIGINT "${profiler_pid}" || true fi } From 797b8ae4e99db7a3542f6b5917fb8ea43e56ff20 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 20 Oct 2025 15:17:37 +0100 Subject: [PATCH 080/101] Added recording of memory allocation --- cylc/flow/scripts/profiler.py | 76 ++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 18 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 3290f132324..b602d91cd41 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -37,6 +37,7 @@ RE_INT = re.compile(r'\d+') max_rss_location = None cpu_time_location = None +memory_allocated_path = None cgroup_version = None comms_timeout = None @@ -77,6 +78,7 @@ class Process: """Class for representing CPU and Memory usage of a process""" cgroup_memory_path: str cgroup_cpu_path: str + memory_allocated_path: str def stop_profiler(*args): @@ -89,9 +91,11 @@ def stop_profiler(*args): or cgroup_version is None): max_rss = 0 cpu_time = 0 + memory_allocated = 0 else: - 
max_rss = parse_memory_file(max_rss_location) + max_rss = parse_memory_file(max_rss_location, cgroup_version) cpu_time = parse_cpu_file(cpu_time_location, cgroup_version) + memory_allocated = parse_memory_allocated(memory_allocated_path, cgroup_version) GRAPHQL_MUTATION = """ mutation($WORKFLOWS: [WorkflowID]!, @@ -105,7 +109,7 @@ def stop_profiler(*args): GRAPHQL_REQUEST_VARIABLES = { "WORKFLOWS": [os.environ.get('CYLC_WORKFLOW_ID')], - "MESSAGES": [["DEBUG", f"cpu_time {cpu_time} max_rss {max_rss}"]], + "MESSAGES": [["DEBUG", f"cpu_time {cpu_time} max_rss {max_rss} mem_alloc {memory_allocated}"]], "JOB": os.environ.get('CYLC_TASK_JOB'), "TIME": "now" } @@ -124,15 +128,43 @@ async def send_cylc_message(): sys.exit(0) -def parse_memory_file(cgroup_memory_path): +def parse_memory_file(cgroup_memory_path, cgroup_version): """Open the memory stat file and copy the appropriate data""" - with open(cgroup_memory_path, 'r') as f: - for line in f: - return int(line) // 1024 + cgroup_memory_path = Path(cgroup_memory_path) + + if cgroup_version == 2: + with open(cgroup_memory_path, 'r') as f: + for line in f: + if "anon" in line: + return int(''.join(filter(str.isdigit, line))) + else: + with open(cgroup_memory_path, 'r') as f: + for line in f: + return int(line) -def parse_cpu_file(cgroup_cpu_path, cgroup_version): +def parse_memory_allocated(cgroup_memory_path, cgroup_version) -> int: + """Open the memory stat file and copy the appropriate data""" + + if cgroup_version == 2: + cgroup_memory_path = Path(cgroup_memory_path) + + for i in range(5): + with open(cgroup_memory_path / "memory.max", 'r') as f: + line = f.readline() + if "max" not in line: + return int(line) + cgroup_memory_path = cgroup_memory_path.parent + if i == 5: + break + elif cgroup_version == 1: + return 0 # Memory limit not tracked for cgroups v1 + + raise FileNotFoundError("Could not find memory.max file") + + +def parse_cpu_file(cgroup_cpu_path, cgroup_version) -> int: """Open the memory stat file and 
return the appropriate data""" if cgroup_version == 2: @@ -140,15 +172,16 @@ def parse_cpu_file(cgroup_cpu_path, cgroup_version): for line in f: if "usage_usec" in line: return int(RE_INT.findall(line)[0]) // 1000 - elif cgroup_version == 1: + raise ValueError("Unable to find cpu usage data") + else: with open(cgroup_cpu_path, 'r') as f: for line in f: # Cgroups v2 uses nanoseconds - return int(line) / 1000000 + return int(line) // 1000000 + raise ValueError("Unable to find cpu usage data") def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: - # HPC uses cgroups v2 and SPICE uses cgroups v1 global cgroup_version if Path.exists(Path(cgroup_location + cgroup_name)): cgroup_version = 2 @@ -161,7 +194,7 @@ def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: cgroup_location + cgroup_name) -def get_cgroup_name(): +def get_cgroup_name() -> str: """Get the cgroup directory for the current process""" # fugly hack to allow functional tests to use test data @@ -184,28 +217,36 @@ def get_cgroup_name(): raise AttributeError("No cgroup found for process:", pid) from err -def get_cgroup_paths(version, location, name): +def get_cgroup_paths(version, location, name) -> Process: global max_rss_location global cpu_time_location + global memory_allocated_path + if version == 2: - max_rss_location = location + name + "/" + "memory.peak" + max_rss_location = location + name + "/" + "memory.stat" cpu_time_location = location + name + "/" + "cpu.stat" + memory_allocated_path = location + name return Process( cgroup_memory_path=location + - name + "/" + "memory.peak", + name + "/" + "memory.stat", cgroup_cpu_path=location + - name + "/" + "cpu.stat") + name + "/" + "cpu.stat", + memory_allocated_path=location + name) elif version == 1: max_rss_location = (location + "/memory" + name + "/memory.max_usage_in_bytes") cpu_time_location = (location + "/cpu" + name + "/cpuacct.usage") + memory_allocated_path = location + name + "/" + "memory.limit_in_bytes" 
return Process( cgroup_memory_path=location + "/memory" + - name + "/memory.max_usage_in_bytes", + name + "/memory.max_usage_in_bytes", cgroup_cpu_path=location + "/cpu" + - name + "/cpuacct.usage") + name + "/cpuacct.usage", + memory_allocated_path="") + + raise ValueError("Unable to determine cgroup version") def profile(process, version, delay, keep_looping=lambda: True): @@ -231,6 +272,5 @@ def get_config(args): if __name__ == "__main__": - arg_parser = get_option_parser() get_config(arg_parser.parse_args([])) From 5bfaf050795dbd05126537a18270ab810c09b9b5 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Mon, 20 Oct 2025 15:48:17 +0100 Subject: [PATCH 081/101] Updating unit tests --- tests/unit/scripts/test_profiler.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 886e2987bed..a82c973d1c0 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -23,6 +23,7 @@ stop_profiler, profile) import pytest +from pathlib import Path from unittest import mock @@ -56,17 +57,18 @@ async def async_request(self, *a, **k): def test_parse_memory_file(mocker): with pytest.raises(FileNotFoundError): - parse_memory_file("non_existent_file.txt") + parse_memory_file("non_existent_file.txt", 2) # Mock the 'open' function call to return a file object. - mock_file = mocker.mock_open(read_data="1024") + mock_file = mocker.mock_open( + read_data="stuff=things\nanon=1024\nthings=stuff") mocker.patch("builtins.open", mock_file) # Test the parse_memory_file function - assert parse_memory_file("mocked_file.txt") == 1 + assert parse_memory_file("mocked_file.txt", 2) == 1024 # Assert that the 'open' function was called with the expected arguments. 
- mock_file.assert_called_once_with("mocked_file.txt", "r") + mock_file.assert_called_once_with(Path("mocked_file.txt"), "r") def test_parse_cpu_file(mocker): @@ -130,7 +132,7 @@ def test_get_cgroup_paths(): process = get_cgroup_paths(2, "test_location/", "test_name") - assert process.cgroup_memory_path == "test_location/test_name/memory.peak" + assert process.cgroup_memory_path == "test_location/test_name/memory.stat" assert process.cgroup_cpu_path == "test_location/test_name/cpu.stat" process = get_cgroup_paths(1, "test_location", From 3c32069f8e916335d8b366d173397420197b267b Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Tue, 21 Oct 2025 11:53:34 +0100 Subject: [PATCH 082/101] profiler: remove global variables --- cylc/flow/scripts/profiler.py | 193 +++++++++++++++------------------- 1 file changed, 87 insertions(+), 106 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index b602d91cd41..b1dc651602e 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -20,57 +20,24 @@ the resource usage of jobs running on the node. 
""" +import asyncio +from dataclasses import dataclass +from functools import partial import os +from pathlib import Path import re +import signal import sys import time -import signal -import asyncio -from pathlib import Path -from dataclasses import dataclass -from cylc.flow.terminal import cli_function + from cylc.flow.network.client_factory import get_client from cylc.flow.option_parsers import CylcOptionParser as COP +from cylc.flow.terminal import cli_function + INTERNAL = True PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") RE_INT = re.compile(r'\d+') -max_rss_location = None -cpu_time_location = None -memory_allocated_path = None -cgroup_version = None -comms_timeout = None - - -def get_option_parser() -> COP: - parser = COP( - __doc__, - comms=True, - argdoc=[ - ], - ) - parser.add_option( - "-i", type=int, - help="interval between query cycles in seconds", dest="delay") - parser.add_option( - "-m", type=str, help="Location of cgroups directory", - dest="cgroup_location") - - return parser - - -@cli_function(get_option_parser) -def main(parser: COP, options) -> None: - """CLI main.""" - global comms_timeout - # Register the stop_profiler function with the signal library - signal.signal(signal.SIGINT, stop_profiler) - signal.signal(signal.SIGHUP, stop_profiler) - signal.signal(signal.SIGTERM, stop_profiler) - - comms_timeout = options.comms_timeout - - get_config(options) @dataclass @@ -79,25 +46,26 @@ class Process: cgroup_memory_path: str cgroup_cpu_path: str memory_allocated_path: str + cgroup_version: int -def stop_profiler(*args): +def stop_profiler(process, comms_timeout, *args): """This function will be executed when the SIGINT signal is sent to this process""" # If a task fails instantly, or finishes very quickly (< 1 second), # the get config function doesn't have time to run - if (max_rss_location is None - or cpu_time_location is None - or cgroup_version is None): + if (process.max_rss_location is None + or process.cpu_time_location is None + or 
process.cgroup_version is None): max_rss = 0 cpu_time = 0 memory_allocated = 0 else: - max_rss = parse_memory_file(max_rss_location, cgroup_version) - cpu_time = parse_cpu_file(cpu_time_location, cgroup_version) - memory_allocated = parse_memory_allocated(memory_allocated_path, cgroup_version) + max_rss = parse_memory_file(process) + cpu_time = parse_cpu_file(process) + memory_allocated = parse_memory_allocated(process) - GRAPHQL_MUTATION = """ + graphql_mutation = """ mutation($WORKFLOWS: [WorkflowID]!, $MESSAGES: [[String]], $JOB: String!, $TIME: String) { message(workflows: $WORKFLOWS, messages:$MESSAGES, @@ -107,7 +75,7 @@ def stop_profiler(*args): } """ - GRAPHQL_REQUEST_VARIABLES = { + graphql_request_variables = { "WORKFLOWS": [os.environ.get('CYLC_WORKFLOW_ID')], "MESSAGES": [["DEBUG", f"cpu_time {cpu_time} max_rss {max_rss} mem_alloc {memory_allocated}"]], "JOB": os.environ.get('CYLC_TASK_JOB'), @@ -120,35 +88,33 @@ def stop_profiler(*args): async def send_cylc_message(): await pclient.async_request( 'graphql', - {'request_string': GRAPHQL_MUTATION, - 'variables': GRAPHQL_REQUEST_VARIABLES}, + {'request_string': graphql_mutation, + 'variables': graphql_request_variables}, ) asyncio.run(send_cylc_message()) sys.exit(0) -def parse_memory_file(cgroup_memory_path, cgroup_version): +def parse_memory_file(process: Process): """Open the memory stat file and copy the appropriate data""" + cgroup_memory_path = Path(process.cgroup_memory_path) - cgroup_memory_path = Path(cgroup_memory_path) - - if cgroup_version == 2: + if process.cgroup_version == 2: with open(cgroup_memory_path, 'r') as f: for line in f: if "anon" in line: return int(''.join(filter(str.isdigit, line))) else: - with open(cgroup_memory_path, 'r') as f: + with open(process.cgroup_memory_path, 'r') as f: for line in f: return int(line) -def parse_memory_allocated(cgroup_memory_path, cgroup_version) -> int: +def parse_memory_allocated(process: Process) -> int: """Open the memory stat file and copy the 
appropriate data""" - - if cgroup_version == 2: - cgroup_memory_path = Path(cgroup_memory_path) + if process.cgroup_version == 2: + cgroup_memory_path = Path(process.cgroup_memory_path) for i in range(5): with open(cgroup_memory_path / "memory.max", 'r') as f: @@ -158,23 +124,22 @@ def parse_memory_allocated(cgroup_memory_path, cgroup_version) -> int: cgroup_memory_path = cgroup_memory_path.parent if i == 5: break - elif cgroup_version == 1: + elif process.cgroup_version == 1: return 0 # Memory limit not tracked for cgroups v1 raise FileNotFoundError("Could not find memory.max file") -def parse_cpu_file(cgroup_cpu_path, cgroup_version) -> int: +def parse_cpu_file(process: Process) -> int: """Open the memory stat file and return the appropriate data""" - - if cgroup_version == 2: - with open(cgroup_cpu_path, 'r') as f: + if process.cgroup_version == 2: + with open(process.cgroup_cpu_path, 'r') as f: for line in f: if "usage_usec" in line: return int(RE_INT.findall(line)[0]) // 1000 raise ValueError("Unable to find cpu usage data") else: - with open(cgroup_cpu_path, 'r') as f: + with open(process.cgroup_cpu_path, 'r') as f: for line in f: # Cgroups v2 uses nanoseconds return int(line) // 1000000 @@ -182,13 +147,10 @@ def parse_cpu_file(cgroup_cpu_path, cgroup_version) -> int: def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: - global cgroup_version if Path.exists(Path(cgroup_location + cgroup_name)): - cgroup_version = 2 - return cgroup_version + return 2 elif Path.exists(Path(cgroup_location + "/memory" + cgroup_name)): - cgroup_version = 1 - return cgroup_version + return 1 else: raise FileNotFoundError("Cgroup not found at " + cgroup_location + cgroup_name) @@ -217,39 +179,33 @@ def get_cgroup_name() -> str: raise AttributeError("No cgroup found for process:", pid) from err -def get_cgroup_paths(version, location, name) -> Process: - global max_rss_location - global cpu_time_location - global memory_allocated_path - - if version == 2: - 
max_rss_location = location + name + "/" + "memory.stat" - cpu_time_location = location + name + "/" + "cpu.stat" - memory_allocated_path = location + name +def get_cgroup_paths(location) -> Process: + cgroup_name = get_cgroup_name() + cgroup_version = get_cgroup_version(location, cgroup_name) + if cgroup_version == 2: return Process( cgroup_memory_path=location + - name + "/" + "memory.stat", + cgroup_name + "/" + "memory.stat", cgroup_cpu_path=location + - name + "/" + "cpu.stat", - memory_allocated_path=location + name) - - elif version == 1: - max_rss_location = (location + "/memory" + - name + "/memory.max_usage_in_bytes") - cpu_time_location = (location + "/cpu" + - name + "/cpuacct.usage") - memory_allocated_path = location + name + "/" + "memory.limit_in_bytes" + cgroup_name + "/" + "cpu.stat", + memory_allocated_path=location + cgroup_name, + cgroup_version=cgroup_version, + ) + + elif cgroup_version == 1: return Process( cgroup_memory_path=location + "/memory" + - name + "/memory.max_usage_in_bytes", + cgroup_name + "/memory.max_usage_in_bytes", cgroup_cpu_path=location + "/cpu" + - name + "/cpuacct.usage", - memory_allocated_path="") + cgroup_name + "/cpuacct.usage", + memory_allocated_path="", + cgroup_version=cgroup_version, + ) raise ValueError("Unable to determine cgroup version") -def profile(process, version, delay, keep_looping=lambda: True): +def profile(_process: Process, delay, keep_looping=lambda: True): # The infinite loop that will constantly poll the cgroup # The lambda function is used to allow the loop to be stopped in unit tests @@ -259,18 +215,43 @@ def profile(process, version, delay, keep_looping=lambda: True): time.sleep(delay) -def get_config(args): - # Find the cgroup that this process is running in. 
- # Cylc will put this profiler in the same cgroup - # as the job it is profiling - cgroup_name = get_cgroup_name() - cgroup_version = get_cgroup_version(args.cgroup_location, cgroup_name) - process = get_cgroup_paths(cgroup_version, - args.cgroup_location, - cgroup_name) - profile(process, cgroup_version, args.delay) +def get_option_parser() -> COP: + parser = COP( + __doc__, + comms=True, + argdoc=[ + ], + ) + parser.add_option( + "-i", type=int, + help="interval between query cycles in seconds", dest="delay") + parser.add_option( + "-m", type=str, help="Location of cgroups directory", + dest="cgroup_location") + + return parser + + +@cli_function(get_option_parser) +def main(_parser: COP, options) -> None: + """CLI main.""" + _main(options) + + +def _main(options) -> None: + # get cgroup information + process = get_cgroup_paths(options.cgroup_location) + + # Register the stop_profiler function with the signal library + _stop_profiler = partial(stop_profiler, process, options.comms_timeout) + signal.signal(signal.SIGINT, _stop_profiler) + signal.signal(signal.SIGHUP, _stop_profiler) + signal.signal(signal.SIGTERM, _stop_profiler) + + # run profiler run + profile(process, options.delay) if __name__ == "__main__": arg_parser = get_option_parser() - get_config(arg_parser.parse_args([])) + _main(arg_parser.parse_args([])) From 8592107d466ca35abeb22ad6a7fd8479f3c1cd64 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 23 Oct 2025 14:06:42 +0100 Subject: [PATCH 083/101] Refactored to global variables --- cylc/flow/scripts/profiler.py | 25 +++--- tests/unit/scripts/test_profiler.py | 129 +++++++++++++++++++--------- 2 files changed, 101 insertions(+), 53 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index b1dc651602e..7f2a1918ab5 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -20,15 +20,16 @@ the resource usage of jobs running on the node. 
""" -import asyncio -from dataclasses import dataclass -from functools import partial import os -from pathlib import Path import re -import signal import sys import time +import signal +import asyncio + +from pathlib import Path +from functools import partial +from dataclasses import dataclass from cylc.flow.network.client_factory import get_client from cylc.flow.option_parsers import CylcOptionParser as COP @@ -54,9 +55,9 @@ def stop_profiler(process, comms_timeout, *args): to this process""" # If a task fails instantly, or finishes very quickly (< 1 second), # the get config function doesn't have time to run - if (process.max_rss_location is None - or process.cpu_time_location is None - or process.cgroup_version is None): + if (process.cgroup_memory_path is None + or process.cgroup_cpu_path is None + or process.memory_allocated_path is None): max_rss = 0 cpu_time = 0 memory_allocated = 0 @@ -101,7 +102,7 @@ def parse_memory_file(process: Process): cgroup_memory_path = Path(process.cgroup_memory_path) if process.cgroup_version == 2: - with open(cgroup_memory_path, 'r') as f: + with open(process.cgroup_memory_path, 'r') as f: for line in f: if "anon" in line: return int(''.join(filter(str.isdigit, line))) @@ -114,7 +115,7 @@ def parse_memory_file(process: Process): def parse_memory_allocated(process: Process) -> int: """Open the memory stat file and copy the appropriate data""" if process.cgroup_version == 2: - cgroup_memory_path = Path(process.cgroup_memory_path) + cgroup_memory_path = Path(process.memory_allocated_path) for i in range(5): with open(cgroup_memory_path / "memory.max", 'r') as f: @@ -194,9 +195,9 @@ def get_cgroup_paths(location) -> Process: elif cgroup_version == 1: return Process( - cgroup_memory_path=location + "/memory" + + cgroup_memory_path=location + "memory/" + cgroup_name + "/memory.max_usage_in_bytes", - cgroup_cpu_path=location + "/cpu" + + cgroup_cpu_path=location + "cpu/" + cgroup_name + "/cpuacct.usage", memory_allocated_path="", 
cgroup_version=cgroup_version, diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index a82c973d1c0..d6a0d0e3035 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -21,13 +21,14 @@ get_cgroup_version, get_cgroup_paths, stop_profiler, - profile) + profile, + Process) import pytest from pathlib import Path from unittest import mock -def test_stop_profiler(mocker, monkeypatch): +def test_stop_profiler(mocker, monkeypatch, tmpdir): monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") def mock_get_client(env_var, timeout=None): @@ -41,51 +42,88 @@ async def async_request(self, *a, **k): pass mocker.patch("cylc.flow.scripts.profiler.get_client", MockedClient) - max_rss_location = None - cpu_time_location = None - cgroup_version = 1 + mem_file = tmpdir.join("memory_file.txt") + mem_file.write('1234') + cpu_file = tmpdir.join("cpu_file.txt") + cpu_file.write('5678') + mem_allocated_file = tmpdir.join("memory_allocated.txt") + mem_allocated_file.write('99999') + + process_object = Process( + cgroup_memory_path=mem_file, + cgroup_cpu_path=cpu_file, + memory_allocated_path=mem_allocated_file, + cgroup_version=1) with pytest.raises(SystemExit) as excinfo: - stop_profiler(max_rss_location, cpu_time_location, cgroup_version) - assert stop_profiler.max_rss == 0 - assert stop_profiler.cpu_time == 0 + stop_profiler(process_object, 1) + assert excinfo.type == SystemExit assert excinfo.value.code == 0 -def test_parse_memory_file(mocker): - - with pytest.raises(FileNotFoundError): - parse_memory_file("non_existent_file.txt", 2) +def test_parse_memory_file(mocker, tmpdir): - # Mock the 'open' function call to return a file object. 
- mock_file = mocker.mock_open( - read_data="stuff=things\nanon=1024\nthings=stuff") - mocker.patch("builtins.open", mock_file) + mem_file = tmpdir.join("memory_file.txt") + mem_file.write('1024') + cpu_file = tmpdir.join("cpu_file.txt") + cpu_file.write('5678') + mem_allocated_file = tmpdir.join("memory_allocated.txt") + mem_allocated_file.write('99999') - # Test the parse_memory_file function - assert parse_memory_file("mocked_file.txt", 2) == 1024 + good_process_object = Process( + cgroup_memory_path=mem_file, + cgroup_cpu_path=cpu_file, + memory_allocated_path=mem_allocated_file, + cgroup_version=1) + bad_process_object = Process( + cgroup_memory_path='', + cgroup_cpu_path='', + memory_allocated_path='', + cgroup_version=1) - # Assert that the 'open' function was called with the expected arguments. - mock_file.assert_called_once_with(Path("mocked_file.txt"), "r") + with pytest.raises(FileNotFoundError): + parse_memory_file(bad_process_object) + # Test the parse_memory_file function + assert parse_memory_file(good_process_object) == 1024 + + + +def test_parse_cpu_file(mocker, tmpdir): + + mem_file = tmpdir.join("memory_file.txt") + mem_file.write('1024') + cpu_file_v1 = tmpdir.join("cpu_file_v1.txt") + cpu_file_v1.write('1234567890') + cpu_file_v2 = tmpdir.join("cpu_file_v2.txt") + cpu_file_v2.write('usage_usec=1234567890') + mem_allocated_file = tmpdir.join("memory_allocated.txt") + mem_allocated_file.write('99999') + + good_process_object_v1 = Process( + cgroup_memory_path=mem_file, + cgroup_cpu_path=cpu_file_v1, + memory_allocated_path=mem_allocated_file, + cgroup_version=1) + good_process_object_v2 = Process( + cgroup_memory_path=mem_file, + cgroup_cpu_path=cpu_file_v2, + memory_allocated_path=mem_allocated_file, + cgroup_version=2) + bad_process_object = Process( + cgroup_memory_path='', + cgroup_cpu_path='', + memory_allocated_path='', + cgroup_version=1) -def test_parse_cpu_file(mocker): with pytest.raises(FileNotFoundError): - 
parse_cpu_file("non_existent_file.txt", 2) + parse_cpu_file(bad_process_object) - # Mock the 'open' function call to return a file object. - mock_file = mocker.mock_open(read_data="1000000") - mocker.patch("builtins.open", mock_file) - assert parse_cpu_file("mocked_file.txt", 1) == 1 - mock_file.assert_called_once_with("mocked_file.txt", "r") + assert parse_cpu_file(good_process_object_v1) == 1234 - mock_file = mocker.mock_open(read_data="usage_usec 1000000") - mocker.patch("builtins.open", mock_file) - assert parse_cpu_file( - "mocked_file.txt", 2) == 1000 - mock_file.assert_called_once_with("mocked_file.txt", "r") + assert parse_cpu_file(good_process_object_v2) == 1234567 def test_get_cgroup_name(mocker): @@ -128,15 +166,21 @@ def test_get_cgroup_version(mocker): 'things') -def test_get_cgroup_paths(): - - process = get_cgroup_paths(2, "test_location/", - "test_name") +def test_get_cgroup_paths(mocker): + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", + return_value='test_name') + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", + return_value=2) + process = get_cgroup_paths("test_location/") assert process.cgroup_memory_path == "test_location/test_name/memory.stat" assert process.cgroup_cpu_path == "test_location/test_name/cpu.stat" - process = get_cgroup_paths(1, "test_location", - "/test_name") + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", + return_value='test_name') + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", + return_value=1) + + process = get_cgroup_paths("test_location/") assert (process.cgroup_memory_path == "test_location/memory/test_name/memory.max_usage_in_bytes") assert (process.cgroup_cpu_path == @@ -145,8 +189,11 @@ def test_get_cgroup_paths(): def test_profile_data(mocker): # This test should run without error - process = get_cgroup_paths(1, "test_location/", - "test_name") + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", + return_value='test_name') + 
mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", + return_value=2) + process = get_cgroup_paths("test_location/") mock_file = mocker.mock_open(read_data="") mocker.patch("builtins.open", mock_file) @@ -155,4 +202,4 @@ def test_profile_data(mocker): mocker.patch("cylc.flow.scripts.profiler.parse_cpu_file", return_value=2048) run_once = mock.Mock(side_effect=[True, False]) - profile(process, 1, 1, run_once) + profile(process, 1, run_once) From 6c959148ecd2d4da44655d457342b93ae2e8d854 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 23 Oct 2025 14:29:47 +0100 Subject: [PATCH 084/101] Code review changes --- cylc/flow/scripts/profiler.py | 9 +++++---- tests/unit/scripts/test_profiler.py | 20 ++++++++++++++------ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 7f2a1918ab5..6dbf0e1673a 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -109,7 +109,8 @@ def parse_memory_file(process: Process): else: with open(process.cgroup_memory_path, 'r') as f: for line in f: - return int(line) + if "total_rss" in line: + return int(''.join(filter(str.isdigit, line))) def parse_memory_allocated(process: Process) -> int: @@ -132,7 +133,7 @@ def parse_memory_allocated(process: Process) -> int: def parse_cpu_file(process: Process) -> int: - """Open the memory stat file and return the appropriate data""" + """Open the CPU stat file and return the appropriate data""" if process.cgroup_version == 2: with open(process.cgroup_cpu_path, 'r') as f: for line in f: @@ -142,7 +143,7 @@ def parse_cpu_file(process: Process) -> int: else: with open(process.cgroup_cpu_path, 'r') as f: for line in f: - # Cgroups v2 uses nanoseconds + # Cgroups v1 uses nanoseconds return int(line) // 1000000 raise ValueError("Unable to find cpu usage data") @@ -196,7 +197,7 @@ def get_cgroup_paths(location) -> Process: elif cgroup_version == 1: return Process( 
cgroup_memory_path=location + "memory/" + - cgroup_name + "/memory.max_usage_in_bytes", + cgroup_name + "/memory.stat", cgroup_cpu_path=location + "cpu/" + cgroup_name + "/cpuacct.usage", memory_allocated_path="", diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index d6a0d0e3035..101a1894c1a 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -65,18 +65,25 @@ async def async_request(self, *a, **k): def test_parse_memory_file(mocker, tmpdir): - mem_file = tmpdir.join("memory_file.txt") - mem_file.write('1024') + mem_file_v1 = tmpdir.join("memory_file_v1.txt") + mem_file_v1.write('total_rss=1024') + mem_file_v2 = tmpdir.join("memory_file_v2.txt") + mem_file_v2.write('anon=666') cpu_file = tmpdir.join("cpu_file.txt") cpu_file.write('5678') mem_allocated_file = tmpdir.join("memory_allocated.txt") mem_allocated_file.write('99999') - good_process_object = Process( - cgroup_memory_path=mem_file, + good_process_object_v1 = Process( + cgroup_memory_path=mem_file_v1, cgroup_cpu_path=cpu_file, memory_allocated_path=mem_allocated_file, cgroup_version=1) + good_process_object_v2 = Process( + cgroup_memory_path=mem_file_v2, + cgroup_cpu_path=cpu_file, + memory_allocated_path=mem_allocated_file, + cgroup_version=2) bad_process_object = Process( cgroup_memory_path='', cgroup_cpu_path='', @@ -87,7 +94,8 @@ def test_parse_memory_file(mocker, tmpdir): parse_memory_file(bad_process_object) # Test the parse_memory_file function - assert parse_memory_file(good_process_object) == 1024 + assert parse_memory_file(good_process_object_v1) == 1024 + assert parse_memory_file(good_process_object_v2) == 666 @@ -182,7 +190,7 @@ def test_get_cgroup_paths(mocker): process = get_cgroup_paths("test_location/") assert (process.cgroup_memory_path == - "test_location/memory/test_name/memory.max_usage_in_bytes") + "test_location/memory/test_name/memory.stat") assert (process.cgroup_cpu_path == 
"test_location/cpu/test_name/cpuacct.usage") From e30a1a4de6e1580c43b180b97e92397a7c42d76c Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 23 Oct 2025 14:37:24 +0100 Subject: [PATCH 085/101] Linting --- tests/unit/scripts/test_profiler.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 101a1894c1a..050b6c637dd 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -24,7 +24,6 @@ profile, Process) import pytest -from pathlib import Path from unittest import mock @@ -58,7 +57,6 @@ async def async_request(self, *a, **k): with pytest.raises(SystemExit) as excinfo: stop_profiler(process_object, 1) - assert excinfo.type == SystemExit assert excinfo.value.code == 0 @@ -98,7 +96,6 @@ def test_parse_memory_file(mocker, tmpdir): assert parse_memory_file(good_process_object_v2) == 666 - def test_parse_cpu_file(mocker, tmpdir): mem_file = tmpdir.join("memory_file.txt") From c52f7fb63a2d13687379f7aaf59516c2f620ffda Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 29 Oct 2025 08:47:37 +0000 Subject: [PATCH 086/101] Linting --- cylc/flow/scripts/profiler.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 6dbf0e1673a..7b491fdc608 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -78,7 +78,11 @@ def stop_profiler(process, comms_timeout, *args): graphql_request_variables = { "WORKFLOWS": [os.environ.get('CYLC_WORKFLOW_ID')], - "MESSAGES": [["DEBUG", f"cpu_time {cpu_time} max_rss {max_rss} mem_alloc {memory_allocated}"]], + "MESSAGES": [[ + "DEBUG", + f"cpu_time {cpu_time} " + f"max_rss {max_rss} " + f"mem_alloc {memory_allocated}"]], "JOB": os.environ.get('CYLC_TASK_JOB'), "TIME": "now" } @@ -99,7 +103,6 @@ async def send_cylc_message(): def parse_memory_file(process: Process): """Open the memory 
stat file and copy the appropriate data""" - cgroup_memory_path = Path(process.cgroup_memory_path) if process.cgroup_version == 2: with open(process.cgroup_memory_path, 'r') as f: @@ -187,9 +190,9 @@ def get_cgroup_paths(location) -> Process: if cgroup_version == 2: return Process( cgroup_memory_path=location + - cgroup_name + "/" + "memory.stat", + cgroup_name + "/" + "memory.stat", cgroup_cpu_path=location + - cgroup_name + "/" + "cpu.stat", + cgroup_name + "/" + "cpu.stat", memory_allocated_path=location + cgroup_name, cgroup_version=cgroup_version, ) @@ -197,9 +200,9 @@ def get_cgroup_paths(location) -> Process: elif cgroup_version == 1: return Process( cgroup_memory_path=location + "memory/" + - cgroup_name + "/memory.stat", + cgroup_name + "/memory.stat", cgroup_cpu_path=location + "cpu/" + - cgroup_name + "/cpuacct.usage", + cgroup_name + "/cpuacct.usage", memory_allocated_path="", cgroup_version=cgroup_version, ) From 19ba481a90d1d2292cd95c4f0c71cc33d57469a7 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 29 Oct 2025 09:41:02 +0000 Subject: [PATCH 087/101] Updating unit tests --- cylc/flow/scripts/profiler.py | 8 ++++---- tests/functional/jobscript/04-profiler-e2e.t | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 7b491fdc608..ad34b9d192a 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -190,9 +190,9 @@ def get_cgroup_paths(location) -> Process: if cgroup_version == 2: return Process( cgroup_memory_path=location + - cgroup_name + "/" + "memory.stat", + cgroup_name + "/" + "memory.stat", cgroup_cpu_path=location + - cgroup_name + "/" + "cpu.stat", + cgroup_name + "/" + "cpu.stat", memory_allocated_path=location + cgroup_name, cgroup_version=cgroup_version, ) @@ -200,9 +200,9 @@ def get_cgroup_paths(location) -> Process: elif cgroup_version == 1: return Process( cgroup_memory_path=location + "memory/" + - cgroup_name + 
"/memory.stat", + cgroup_name + "/memory.stat", cgroup_cpu_path=location + "cpu/" + - cgroup_name + "/cpuacct.usage", + cgroup_name + "/cpuacct.usage", memory_allocated_path="", cgroup_version=cgroup_version, ) diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t index 053ba348e82..82436a8e214 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -30,7 +30,8 @@ set_test_number 7 mkdir -p "${PWD}/cgroups_test_data" -echo '12345678' > cgroups_test_data/memory.peak +echo '12345678' > cgroups_test_data/memory.stat +echo '123456789' > cgroups_test_data/memory.max printf "blah blah 123456\nusage_usec 56781234" > cgroups_test_data/cpu.stat export profiler_test_env_var='/cgroups_test_data' From 48bb9c4d67005cba285b6a04bec3808b5179d36d Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 29 Oct 2025 10:14:01 +0000 Subject: [PATCH 088/101] Linting --- cylc/flow/scripts/profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index ad34b9d192a..afa603e21b3 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -161,7 +161,7 @@ def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: cgroup_location + cgroup_name) -def get_cgroup_name() -> str: +def get_cgroup_name(): """Get the cgroup directory for the current process""" # fugly hack to allow functional tests to use test data From fb0a1568e11861c09872ba4d09adcf590b131aa6 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 29 Oct 2025 11:49:13 +0000 Subject: [PATCH 089/101] Adding unit test coverage --- cylc/flow/scripts/profiler.py | 27 +++++++++++++++------------ tests/unit/scripts/test_profiler.py | 17 +++++++++++++++++ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 
afa603e21b3..68d31a416ed 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -53,18 +53,8 @@ class Process: def stop_profiler(process, comms_timeout, *args): """This function will be executed when the SIGINT signal is sent to this process""" - # If a task fails instantly, or finishes very quickly (< 1 second), - # the get config function doesn't have time to run - if (process.cgroup_memory_path is None - or process.cgroup_cpu_path is None - or process.memory_allocated_path is None): - max_rss = 0 - cpu_time = 0 - memory_allocated = 0 - else: - max_rss = parse_memory_file(process) - cpu_time = parse_cpu_file(process) - memory_allocated = parse_memory_allocated(process) + + max_rss, cpu_time, memory_allocated = get_resource_usage(process) graphql_mutation = """ mutation($WORKFLOWS: [WorkflowID]!, @@ -101,6 +91,19 @@ async def send_cylc_message(): sys.exit(0) +def get_resource_usage(process): + # If a task fails instantly, or finishes very quickly (< 1 second), + # the get config function doesn't have time to run + if (process.cgroup_memory_path is None + or process.cgroup_cpu_path is None + or process.memory_allocated_path is None): + return 0, 0, 0 + max_rss = parse_memory_file(process) + cpu_time = parse_cpu_file(process) + memory_allocated = parse_memory_allocated(process) + return max_rss, cpu_time, memory_allocated + + def parse_memory_file(process: Process): """Open the memory stat file and copy the appropriate data""" diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 050b6c637dd..b014986e72f 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -20,6 +20,7 @@ get_cgroup_name, get_cgroup_version, get_cgroup_paths, + get_resource_usage, stop_profiler, profile, Process) @@ -61,6 +62,22 @@ async def async_request(self, *a, **k): assert excinfo.value.code == 0 +def test_get_resource_usage(mocker, monkeypatch, tmpdir): + 
monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") + + process_object = Process( + cgroup_memory_path=None, + cgroup_cpu_path=None, + memory_allocated_path=None, + cgroup_version=1) + + max_rss, cpu_time, memory_allocated = get_resource_usage(process_object) + + assert max_rss == 0 + assert cpu_time == 0 + assert memory_allocated == 0 + + def test_parse_memory_file(mocker, tmpdir): mem_file_v1 = tmpdir.join("memory_file_v1.txt") From ba94659eecbb9f8c74ad8277709f94408169f75d Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 29 Oct 2025 13:23:07 +0000 Subject: [PATCH 090/101] Adding unit test coverage --- tests/unit/scripts/test_profiler.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index b014986e72f..a57efff44f9 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -17,6 +17,7 @@ # Tests for functions contained in cylc.flow.scripts.profiler from cylc.flow.scripts.profiler import (parse_memory_file, parse_cpu_file, + parse_memory_allocated, get_cgroup_name, get_cgroup_version, get_cgroup_paths, @@ -159,6 +160,34 @@ def test_get_cgroup_name(mocker): mocker.patch("builtins.open", mock_file) assert get_cgroup_name() == "good/cgroup/place/2222222" +def test_parse_memory_allocated(mocker, tmpdir): + mem_allocated_file = tmpdir.join("memory.max") + mem_allocated_file.write('99999') + + # We curently do not track memory allocated for cgroups v1 + good_process_object_v1 = Process( + cgroup_memory_path='', + cgroup_cpu_path='', + memory_allocated_path=tmpdir, + cgroup_version=1) + + good_process_object_v2 = Process( + cgroup_memory_path='', + cgroup_cpu_path='', + memory_allocated_path=tmpdir, + cgroup_version=2) + + bad_process_object_v2 = Process( + cgroup_memory_path='', + cgroup_cpu_path='', + memory_allocated_path='/', + cgroup_version=2) + + assert parse_memory_allocated(good_process_object_v1) == 0 + 
assert parse_memory_allocated(good_process_object_v2) == 99999 + with pytest.raises(FileNotFoundError): + parse_memory_file(bad_process_object_v2) + def test_get_cgroup_name_file_not_found(mocker): From ed7f4fc1d6c3920d56b28d29c6e1d65c282de27c Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 29 Oct 2025 13:24:35 +0000 Subject: [PATCH 091/101] Adding unit test coverage --- tests/unit/scripts/test_profiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index a57efff44f9..db2ea9a88b1 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -160,6 +160,7 @@ def test_get_cgroup_name(mocker): mocker.patch("builtins.open", mock_file) assert get_cgroup_name() == "good/cgroup/place/2222222" + def test_parse_memory_allocated(mocker, tmpdir): mem_allocated_file = tmpdir.join("memory.max") mem_allocated_file.write('99999') From 5de1e27e275bbedf706c70b9ebb7a2ab35ad90c8 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 29 Oct 2025 14:10:31 +0000 Subject: [PATCH 092/101] Adding unit test coverage --- tests/unit/scripts/test_profiler.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index db2ea9a88b1..62ee9ec3319 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -122,6 +122,9 @@ def test_parse_cpu_file(mocker, tmpdir): cpu_file_v1.write('1234567890') cpu_file_v2 = tmpdir.join("cpu_file_v2.txt") cpu_file_v2.write('usage_usec=1234567890') + cpu_file_v2_bad = tmpdir.join("cpu_file_v2_bad.txt") + cpu_file_v2_bad.write('Give me fuel, give me fire, ' + 'give me that which I desire') mem_allocated_file = tmpdir.join("memory_allocated.txt") mem_allocated_file.write('99999') @@ -140,14 +143,19 @@ def test_parse_cpu_file(mocker, tmpdir): cgroup_cpu_path='', memory_allocated_path='', 
cgroup_version=1) - - with pytest.raises(FileNotFoundError): - parse_cpu_file(bad_process_object) + bad_process_object_v2 = Process( + cgroup_memory_path=mem_file, + cgroup_cpu_path=cpu_file_v2_bad, + memory_allocated_path=mem_allocated_file, + cgroup_version=2) assert parse_cpu_file(good_process_object_v1) == 1234 - assert parse_cpu_file(good_process_object_v2) == 1234567 + with pytest.raises(FileNotFoundError): + parse_cpu_file(bad_process_object) + with pytest.raises(ValueError): + parse_cpu_file(bad_process_object_v2) def test_get_cgroup_name(mocker): @@ -165,7 +173,7 @@ def test_parse_memory_allocated(mocker, tmpdir): mem_allocated_file = tmpdir.join("memory.max") mem_allocated_file.write('99999') - # We curently do not track memory allocated for cgroups v1 + # We currently do not track memory allocated for cgroups v1 good_process_object_v1 = Process( cgroup_memory_path='', cgroup_cpu_path='', @@ -238,6 +246,13 @@ def test_get_cgroup_paths(mocker): assert (process.cgroup_cpu_path == "test_location/cpu/test_name/cpuacct.usage") + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_name", + return_value='test_name') + mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", + return_value=3) + with pytest.raises(ValueError): + get_cgroup_paths("test_location/") + def test_profile_data(mocker): # This test should run without error From 2762d567661cfa36de2131309114d7861fa9f30c Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 29 Oct 2025 14:13:12 +0000 Subject: [PATCH 093/101] Linting --- tests/unit/scripts/test_profiler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 62ee9ec3319..7f5d6837ff6 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -124,7 +124,7 @@ def test_parse_cpu_file(mocker, tmpdir): cpu_file_v2.write('usage_usec=1234567890') cpu_file_v2_bad = tmpdir.join("cpu_file_v2_bad.txt") 
cpu_file_v2_bad.write('Give me fuel, give me fire, ' - 'give me that which I desire') + 'give me that which I desire') mem_allocated_file = tmpdir.join("memory_allocated.txt") mem_allocated_file.write('99999') @@ -157,6 +157,7 @@ def test_parse_cpu_file(mocker, tmpdir): with pytest.raises(ValueError): parse_cpu_file(bad_process_object_v2) + def test_get_cgroup_name(mocker): mock_file = mocker.mock_open(read_data="0::bad/test/cgroup/place") From e1978c142cb9ae7f01b8be4c9bb2732e57677c38 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 31 Oct 2025 08:52:24 +0000 Subject: [PATCH 094/101] Linting --- cylc/flow/scripts/profiler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 68d31a416ed..5e3b4b1e89b 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -36,7 +36,6 @@ from cylc.flow.terminal import cli_function -INTERNAL = True PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") RE_INT = re.compile(r'\d+') From 61f0ceb4aabcf619e86f2e21b1e7445cfee4a5c3 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 31 Oct 2025 09:11:49 +0000 Subject: [PATCH 095/101] Unit test coverage --- tests/unit/scripts/test_profiler.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 7f5d6837ff6..32eb292ddbd 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -271,3 +271,28 @@ def test_profile_data(mocker): return_value=2048) run_once = mock.Mock(side_effect=[True, False]) profile(process, 1, run_once) + + +@pytest.fixture +def options(mocker): + opts = mocker.Mock() + opts.cgroup_location = "/fake/path" + opts.comms_timeout = 10 + opts.delay = 1 + return opts + +def test_main(mocker, options): + mock_get_cgroup_paths = mocker.patch( + "cylc.flow.scripts.profiler.get_cgroup_paths" + ) + mock_signal = 
mocker.patch("cylc.flow.scripts.profiler.signal.signal") + mock_profile = mocker.patch("cylc.flow.scripts.profiler.profile") + + mock_get_cgroup_paths.return_value = mocker.Mock() + + from cylc.flow.scripts.profiler import _main + _main(options) + + mock_get_cgroup_paths.assert_called_once_with("/fake/path") + assert mock_signal.call_count == 3 + mock_profile.assert_called_once() From 2b27c2583ba3144d2ad808d12429fe332cc3ab95 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 31 Oct 2025 09:14:10 +0000 Subject: [PATCH 096/101] Linting --- tests/unit/scripts/test_profiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 32eb292ddbd..10e6f7df3fb 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -281,6 +281,7 @@ def options(mocker): opts.delay = 1 return opts + def test_main(mocker, options): mock_get_cgroup_paths = mocker.patch( "cylc.flow.scripts.profiler.get_cgroup_paths" From f26ed40e251254513a15be24a1e2fe89967f71b0 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Wed, 19 Nov 2025 14:41:11 +0000 Subject: [PATCH 097/101] Adding test coverage --- cylc/flow/scripts/profiler.py | 21 ++++---- tests/unit/scripts/test_profiler.py | 76 +++++++++++++++++++++++------ 2 files changed, 70 insertions(+), 27 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 5e3b4b1e89b..8e2bc5c52d6 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -122,20 +122,15 @@ def parse_memory_allocated(process: Process) -> int: """Open the memory stat file and copy the appropriate data""" if process.cgroup_version == 2: cgroup_memory_path = Path(process.memory_allocated_path) - for i in range(5): with open(cgroup_memory_path / "memory.max", 'r') as f: line = f.readline() if "max" not in line: return int(line) cgroup_memory_path = cgroup_memory_path.parent - if i == 5: - break - elif 
process.cgroup_version == 1: - return 0 # Memory limit not tracked for cgroups v1 - - raise FileNotFoundError("Could not find memory.max file") - + return 0 + else : # Memory limit not tracked for cgroups v1 + return 0 def parse_cpu_file(process: Process) -> int: """Open the CPU stat file and return the appropriate data""" @@ -147,10 +142,12 @@ def parse_cpu_file(process: Process) -> int: raise ValueError("Unable to find cpu usage data") else: with open(process.cgroup_cpu_path, 'r') as f: - for line in f: - # Cgroups v1 uses nanoseconds - return int(line) // 1000000 - raise ValueError("Unable to find cpu usage data") + try: + for line in f: + # Cgroups v1 uses nanoseconds + return int(line) // 1000000 + except ValueError: + raise ValueError("Unable to find cpu usage data") def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 10e6f7df3fb..6ab52a33143 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -118,10 +118,12 @@ def test_parse_cpu_file(mocker, tmpdir): mem_file = tmpdir.join("memory_file.txt") mem_file.write('1024') - cpu_file_v1 = tmpdir.join("cpu_file_v1.txt") - cpu_file_v1.write('1234567890') - cpu_file_v2 = tmpdir.join("cpu_file_v2.txt") - cpu_file_v2.write('usage_usec=1234567890') + cpu_file_v1_good = tmpdir.join("cpu_file_v1_good.txt") + cpu_file_v1_good.write('1234567890') + cpu_file_v1_bad = tmpdir.join("cpu_file_v1_bad.txt") + cpu_file_v1_bad.write("I'm your dream, mind ashtray") + cpu_file_v2_good = tmpdir.join("cpu_file_v2_good.txt") + cpu_file_v2_good.write('usage_usec=1234567890') cpu_file_v2_bad = tmpdir.join("cpu_file_v2_bad.txt") cpu_file_v2_bad.write('Give me fuel, give me fire, ' 'give me that which I desire') @@ -130,19 +132,24 @@ def test_parse_cpu_file(mocker, tmpdir): good_process_object_v1 = Process( cgroup_memory_path=mem_file, - cgroup_cpu_path=cpu_file_v1, + 
cgroup_cpu_path=cpu_file_v1_good, memory_allocated_path=mem_allocated_file, cgroup_version=1) good_process_object_v2 = Process( cgroup_memory_path=mem_file, - cgroup_cpu_path=cpu_file_v2, + cgroup_cpu_path=cpu_file_v2_good, memory_allocated_path=mem_allocated_file, cgroup_version=2) - bad_process_object = Process( + bad_process_object_v1_1 = Process( cgroup_memory_path='', cgroup_cpu_path='', memory_allocated_path='', cgroup_version=1) + bad_process_object_v1_2 = Process( + cgroup_memory_path=mem_file, + cgroup_cpu_path=cpu_file_v1_bad, + memory_allocated_path=mem_allocated_file, + cgroup_version=1) bad_process_object_v2 = Process( cgroup_memory_path=mem_file, cgroup_cpu_path=cpu_file_v2_bad, @@ -153,7 +160,9 @@ def test_parse_cpu_file(mocker, tmpdir): assert parse_cpu_file(good_process_object_v2) == 1234567 with pytest.raises(FileNotFoundError): - parse_cpu_file(bad_process_object) + parse_cpu_file(bad_process_object_v1_1) + with pytest.raises(ValueError): + parse_cpu_file(bad_process_object_v1_2) with pytest.raises(ValueError): parse_cpu_file(bad_process_object_v2) @@ -170,24 +179,25 @@ def test_get_cgroup_name(mocker): assert get_cgroup_name() == "good/cgroup/place/2222222" -def test_parse_memory_allocated(mocker, tmpdir): - mem_allocated_file = tmpdir.join("memory.max") - mem_allocated_file.write('99999') +def test_parse_memory_allocated(tmp_path_factory): + good_mem_dir = tmp_path_factory.mktemp("mem_dir") + mem_allocated_file = good_mem_dir / "memory.max" + mem_allocated_file.write_text('99999') # We currently do not track memory allocated for cgroups v1 good_process_object_v1 = Process( cgroup_memory_path='', cgroup_cpu_path='', - memory_allocated_path=tmpdir, + memory_allocated_path=str(good_mem_dir), cgroup_version=1) good_process_object_v2 = Process( cgroup_memory_path='', cgroup_cpu_path='', - memory_allocated_path=tmpdir, + memory_allocated_path=str(good_mem_dir), cgroup_version=2) - bad_process_object_v2 = Process( + bad_process_object_v2_1 = Process( 
cgroup_memory_path='', cgroup_cpu_path='', memory_allocated_path='/', @@ -196,8 +206,44 @@ def test_parse_memory_allocated(mocker, tmpdir): assert parse_memory_allocated(good_process_object_v1) == 0 assert parse_memory_allocated(good_process_object_v2) == 99999 with pytest.raises(FileNotFoundError): - parse_memory_file(bad_process_object_v2) + parse_memory_file(bad_process_object_v2_1) + + # Nested directories with 'max' value + base_dir = tmp_path_factory.mktemp("base") + + dir_1 = base_dir / "dir_1" + dir_1.mkdir() + mem_file_1 = dir_1 / "memory.max" + mem_file_1.write_text("max") + + dir_2 = dir_1 / "dir_2" + dir_2.mkdir() + mem_file_2 = dir_2 / "memory.max" + mem_file_2.write_text("max") + + dir_3 = dir_2 / "dir_3" + dir_3.mkdir() + mem_file_3 = dir_3 / "memory.max" + mem_file_3.write_text("max") + + dir_4 = dir_3 / "dir_4" + dir_4.mkdir() + mem_file_4 = dir_4 / "memory.max" + mem_file_4.write_text("max") + + dir_5 = dir_4 / "dir_5" + dir_5.mkdir() + mem_file_5 = dir_5 / "memory.max" + mem_file_5.write_text("max") + + + bad_process_object_v2_2 = Process( + cgroup_memory_path='', + cgroup_cpu_path='', + memory_allocated_path=str(dir_5), + cgroup_version=2) + assert parse_memory_allocated(bad_process_object_v2_2) == 0 def test_get_cgroup_name_file_not_found(mocker): From 80c38d040812d981c5d7cbb78ce4ab53d9cd8b06 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 20 Nov 2025 13:38:07 +0000 Subject: [PATCH 098/101] Linting --- cylc/flow/scripts/profiler.py | 18 +++++++++--------- tests/unit/scripts/test_profiler.py | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 8e2bc5c52d6..87c9c317d94 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -122,18 +122,20 @@ def parse_memory_allocated(process: Process) -> int: """Open the memory stat file and copy the appropriate data""" if process.cgroup_version == 2: cgroup_memory_path = 
Path(process.memory_allocated_path) - for i in range(5): + for _ in range(5): with open(cgroup_memory_path / "memory.max", 'r') as f: line = f.readline() if "max" not in line: return int(line) cgroup_memory_path = cgroup_memory_path.parent return 0 - else : # Memory limit not tracked for cgroups v1 + else: # Memory limit not tracked for cgroups v1 return 0 + def parse_cpu_file(process: Process) -> int: """Open the CPU stat file and return the appropriate data""" + if process.cgroup_version == 2: with open(process.cgroup_cpu_path, 'r') as f: for line in f: @@ -142,12 +144,11 @@ def parse_cpu_file(process: Process) -> int: raise ValueError("Unable to find cpu usage data") else: with open(process.cgroup_cpu_path, 'r') as f: - try: - for line in f: - # Cgroups v1 uses nanoseconds - return int(line) // 1000000 - except ValueError: - raise ValueError("Unable to find cpu usage data") + for line in f: + # Cgroups v1 uses nanoseconds + return int(line) // 1000000 + + raise ValueError("Unable to find cpu usage data") def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: @@ -205,7 +206,6 @@ def get_cgroup_paths(location) -> Process: memory_allocated_path="", cgroup_version=cgroup_version, ) - raise ValueError("Unable to determine cgroup version") diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 6ab52a33143..9aece7ce253 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -207,7 +207,7 @@ def test_parse_memory_allocated(tmp_path_factory): assert parse_memory_allocated(good_process_object_v2) == 99999 with pytest.raises(FileNotFoundError): parse_memory_file(bad_process_object_v2_1) - + # Nested directories with 'max' value base_dir = tmp_path_factory.mktemp("base") @@ -236,7 +236,6 @@ def test_parse_memory_allocated(tmp_path_factory): mem_file_5 = dir_5 / "memory.max" mem_file_5.write_text("max") - bad_process_object_v2_2 = Process( cgroup_memory_path='', cgroup_cpu_path='', @@ 
-245,6 +244,7 @@ def test_parse_memory_allocated(tmp_path_factory): assert parse_memory_allocated(bad_process_object_v2_2) == 0 + def test_get_cgroup_name_file_not_found(mocker): def mock_os_pid(): From c54cff53606718518bc1d5fe56350bc44d550586 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 20 Nov 2025 14:50:05 +0000 Subject: [PATCH 099/101] Implemented usage of CylcError --- cylc/flow/scripts/profiler.py | 15 +++++++++------ tests/unit/scripts/test_profiler.py | 12 +++++------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 87c9c317d94..8a878c97f0d 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -31,9 +31,10 @@ from functools import partial from dataclasses import dataclass -from cylc.flow.network.client_factory import get_client -from cylc.flow.option_parsers import CylcOptionParser as COP +from cylc.flow.exceptions import CylcError from cylc.flow.terminal import cli_function +from cylc.flow.option_parsers import CylcOptionParser as COP +from cylc.flow.network.client_factory import get_client PID_REGEX = re.compile(r"([^:]*\d{6,}.*)") @@ -117,6 +118,8 @@ def parse_memory_file(process: Process): if "total_rss" in line: return int(''.join(filter(str.isdigit, line))) + raise CylcError("Unable to find memory usage data") + def parse_memory_allocated(process: Process) -> int: """Open the memory stat file and copy the appropriate data""" @@ -148,7 +151,7 @@ def parse_cpu_file(process: Process) -> int: # Cgroups v1 uses nanoseconds return int(line) // 1000000 - raise ValueError("Unable to find cpu usage data") + raise CylcError("Unable to find cpu usage data") def get_cgroup_version(cgroup_location: str, cgroup_name: str) -> int: @@ -177,8 +180,8 @@ def get_cgroup_name(): result = PID_REGEX.search(result).group() return result except FileNotFoundError as err: - raise FileNotFoundError( - '/proc/' + str(pid) + '/cgroup not found') from 
err + raise CylcError( + '/proc/' + str(pid) + '/cgroup not found') from None except AttributeError as err: raise AttributeError("No cgroup found for process:", pid) from err @@ -206,7 +209,7 @@ def get_cgroup_paths(location) -> Process: memory_allocated_path="", cgroup_version=cgroup_version, ) - raise ValueError("Unable to determine cgroup version") + raise CylcError("Unable to determine cgroup version") def profile(_process: Process, delay, keep_looping=lambda: True): diff --git a/tests/unit/scripts/test_profiler.py b/tests/unit/scripts/test_profiler.py index 9aece7ce253..6ec020c66e8 100644 --- a/tests/unit/scripts/test_profiler.py +++ b/tests/unit/scripts/test_profiler.py @@ -27,15 +27,13 @@ Process) import pytest from unittest import mock +from cylc.flow.exceptions import CylcError def test_stop_profiler(mocker, monkeypatch, tmpdir): monkeypatch.setenv('CYLC_WORKFLOW_ID', "test_value") - def mock_get_client(env_var, timeout=None): - return True - - class MockedClient(): + class MockedClient: def __init__(self, *a, **k): pass @@ -45,7 +43,7 @@ async def async_request(self, *a, **k): mocker.patch("cylc.flow.scripts.profiler.get_client", MockedClient) mem_file = tmpdir.join("memory_file.txt") - mem_file.write('1234') + mem_file.write('total_rss 1234') cpu_file = tmpdir.join("cpu_file.txt") cpu_file.write('5678') mem_allocated_file = tmpdir.join("memory_allocated.txt") @@ -251,7 +249,7 @@ def mock_os_pid(): return 'The Thing That Should Not Be' mocker.patch("os.getpid", mock_os_pid) - with pytest.raises(FileNotFoundError): + with pytest.raises(CylcError): get_cgroup_name() @@ -297,7 +295,7 @@ def test_get_cgroup_paths(mocker): return_value='test_name') mocker.patch("cylc.flow.scripts.profiler.get_cgroup_version", return_value=3) - with pytest.raises(ValueError): + with pytest.raises(CylcError): get_cgroup_paths("test_location/") From f5e5af8da661e466ea2bf39f01edc3b30e28e8fb Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Thu, 20 Nov 2025 14:52:07 +0000 
Subject: [PATCH 100/101] Linting --- cylc/flow/scripts/profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cylc/flow/scripts/profiler.py b/cylc/flow/scripts/profiler.py index 8a878c97f0d..ef26ed35256 100755 --- a/cylc/flow/scripts/profiler.py +++ b/cylc/flow/scripts/profiler.py @@ -179,7 +179,7 @@ def get_cgroup_name(): result = f.read() result = PID_REGEX.search(result).group() return result - except FileNotFoundError as err: + except FileNotFoundError: raise CylcError( '/proc/' + str(pid) + '/cgroup not found') from None From 1598cd3f508919a6001d05f423a80c8eaee08329 Mon Sep 17 00:00:00 2001 From: "christopher.bennett" Date: Fri, 21 Nov 2025 13:23:49 +0000 Subject: [PATCH 101/101] Fix functional tests --- tests/functional/jobscript/04-profiler-e2e.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/jobscript/04-profiler-e2e.t b/tests/functional/jobscript/04-profiler-e2e.t index 82436a8e214..8e960647233 100644 --- a/tests/functional/jobscript/04-profiler-e2e.t +++ b/tests/functional/jobscript/04-profiler-e2e.t @@ -30,7 +30,7 @@ set_test_number 7 mkdir -p "${PWD}/cgroups_test_data" -echo '12345678' > cgroups_test_data/memory.stat +echo 'anon 12345678' > cgroups_test_data/memory.stat echo '123456789' > cgroups_test_data/memory.max printf "blah blah 123456\nusage_usec 56781234" > cgroups_test_data/cpu.stat