| Copy this cmu_user_script to the CMU
management node:
#!/usr/bin/perl -w
#
# cmu_user_script
#
# This script periodically scans the LSF/SLURM/PBS job queues
# and creates CMU user groups for each currently running job.
#
# Full paths of the CMU command-line tools this script runs via backticks.
# These CMU variables should not need changing.
my $CMU_SHOW_GROUPS = "/opt/cmu/bin/cmu_show_user_groups";
my $CMU_ADD_GROUP = "/opt/cmu/bin/cmu_add_user_group";
my $CMU_ADD_NODES = "/opt/cmu/bin/cmu_add_to_user_group";
my $CMU_DEL_GROUP = "/opt/cmu/bin/cmu_del_user_group";
my $CMU_SHOW_NODES = "/opt/cmu/bin/cmu_show_nodes";
# The following string is placed ahead of the
# LSFJOBS command to set up the LSF environment.
my $LSFENV = "/bin/env LSF_ENVDIR=/etc/lava/conf LSF_SERVERDIR=/usr/sbin";
# The following variables must be full paths: each one is tested
# for existence (-f) on every pass of the main loop, and a scheduler
# is only queried when its command file exists.
# Make sure these paths are correct!
# Or set one to a bogus path to disable the query for that scheduler.
my $SQUEUE = "/usr/local/bin/squeue";
my $LSFJOBS = "/usr/bin/bjobs";
my $PBSJOBS = "/usr/bin/qstat";
# SLEEP_INTERVAL is the number of seconds of sleep before
# checking the queues again.
my $SLEEP_INTERVAL = 2;
###########################################
# SHOULD NOT NEED TO EDIT BELOW THIS LINE
###########################################
# function declarations
# (forward declarations with prototypes; these subs are defined
# further down in the file, after the main loop)
sub expand($);
sub addSLURMjobsToHash();
sub addLSFjobsToHash();
sub addPBSjobsToHash();
# global variables
# %jobhash maps a CMU user group name (SLURM_..., LSF_... or PBS_...)
# to the space-separated list of CMU nodes running that job. It is
# emptied and refilled on every pass of the main loop.
my %jobhash;
# @cmu_nodes holds the node names printed by $CMU_SHOW_NODES; it is
# refreshed by get_cmu_nodes() and searched by is_cmu_node().
my @cmu_nodes;
# Refresh the global @cmu_nodes list from the output of $CMU_SHOW_NODES.
# The command output is treated as one string and cut on runs of
# whitespace, so each whitespace-delimited token becomes a node name.
sub get_cmu_nodes() {
    my $node_listing = `$CMU_SHOW_NODES`;
    @cmu_nodes = split /\s+/, $node_listing;
}
# Report whether a node name is one of the nodes known to CMU.
#
#   $candidate - node name to look up
#
# Returns 1 when the name is string-equal to an entry of the global
# @cmu_nodes list (as last loaded by get_cmu_nodes), otherwise 0.
sub is_cmu_node($) {
    my $candidate = shift;
    for my $known (@cmu_nodes) {
        next unless $known eq $candidate;
        return 1;
    }
    return 0;
}
# Main polling loop. Runs forever; each pass reconciles the CMU user
# groups with the jobs the schedulers currently report as running:
#   1. reload the CMU node list and the list of existing user groups,
#   2. rebuild %jobhash from every scheduler whose command file exists,
#   3. delete scheduler-owned groups whose job is no longer listed,
#   4. create a group (and add its nodes) for every job without one,
#   5. sleep $SLEEP_INTERVAL seconds.
while (1) {
    #
    # Get existing CMU Nodes and User Groups
    #
    get_cmu_nodes();
    my $curgroups = `$CMU_SHOW_GROUPS`;
    my @cmugroups = split /\n/, $curgroups;
    # Set of existing group names, so membership is a hash lookup
    # instead of a scan over the whole list for every job.
    my %is_cmu_group = map { $_ => 1 } @cmugroups;

    #
    # Reload the jobhash
    #
    %jobhash = ();
    addSLURMjobsToHash() if ( -f $SQUEUE );
    addLSFjobsToHash()   if ( -f $LSFJOBS );
    addPBSjobsToHash()   if ( -f $PBSJOBS );

    #
    # delete old user groups that begin with
    # 'SLURM_', 'LSF_', or 'PBS_'
    #
    foreach my $g (@cmugroups) {
        # the group still corresponds to a running job: keep it
        next if exists $jobhash{$g};
        # Only touch groups this script owns. The pattern accepts the
        # bare scheduler tag or the tag followed by '_', which is what
        # comparing the first '_'-separated field accepted, but it does
        # not trip an undefined-value warning on a blank line.
        next unless $g =~ /\A(?:SLURM|LSF|PBS)(?:_|\z)/;
        `$CMU_DEL_GROUP $g`;
    }

    #
    # add new user groups
    #
    foreach my $j (keys %jobhash) {
        # FIXME confirm/update the node list of a group that already exists
        next if $is_cmu_group{$j};
        # create the CMU user group and add the nodelist to it
        `$CMU_ADD_GROUP $j`;
        `$CMU_ADD_NODES -t $j $jobhash{$j}` if ($jobhash{$j} ne "");
    }

    sleep $SLEEP_INTERVAL;
}
# Add one %jobhash entry per job listed by squeue.
#
# Each output line is expected to hold exactly three whitespace-separated
# fields, in the order requested by the -o format string: job id (%i),
# user (%u) and node list (%N). Lines with any other field count are
# skipped. The node list is expanded with expand() and filtered down to
# nodes known to CMU.
#
# Side effect: sets $jobhash{"SLURM_{USER}_{JOBID}"} to a string of
# " node" items (empty string when none of the job's nodes is a CMU node).
sub addSLURMjobsToHash() {
    #
    # If SLURM is not running on the local node then
    # prepend the $SQUEUE command below with 'ssh nX'
    #
    my $curjobs = `$SQUEUE -t R -h -o "%i %u %N"`;
    my @slist = split /\n/, $curjobs;
    foreach (@slist) {
        my @elements = split /\s+/;
        next unless (@elements == 3);
        # name = SLURM_{USER}_{JOBID}
        my $jname = 'SLURM_' . $elements[1] . '_' . $elements[0];
        my $nodestr = "";
        foreach my $n (expand($elements[2])) {
            $nodestr .= " $n" if (is_cmu_node($n));
        }
        $jobhash{$jname} = $nodestr;
    }
}
# Add one %jobhash entry per running job listed by bjobs.
#
# The output is walked line by line as a small state machine:
#   * a line with more than 7 fields whose third field is "RUN" starts
#     a new job; the previously collected job (if any) is stored first;
#   * a line consisting of a single field, seen while a job is open, is
#     taken as one more host belonging to that job.
# Hosts may carry a "count*" prefix, which is stripped. Only hosts known
# to CMU are kept.
#
# Side effect: sets $jobhash{"LSF_{user}_{jobid}[_{array_id}]"} to a
# string of " node" items for every running job found.
sub addLSFjobsToHash() {
    my $curjobs = `$LSFENV $LSFJOBS -r -u all -w`;
    my @jstrs = split /\n/, $curjobs;
    my $jname = "";
    # Node list of the job currently being collected. This must be a
    # lexical scalar: declaring it as an array left every use of
    # $nodestr below pointing at an undeclared package global.
    my $nodestr = "";
    foreach (@jstrs) {
        my @elements = split /\s+/;
        if (@elements == 1 && $jname ne "") {
            # This must be a node from the current job
            # if LSB_SHORT_HOSTLIST is enabled, then
            # remove the prepended cpu count
            my @items = split /\*/, $elements[0];
            # check for duplicates
            my @check = split /\s+/, $nodestr;
            my $found = 0;
            foreach my $n (@check) {
                if ($n eq $items[-1]) {
                    $found = 1;
                    last;
                }
            }
            if (!$found && is_cmu_node($items[-1])) {
                $nodestr .= " $items[-1]";
            }
            next;
        }
        if (@elements > 7 && $elements[2] eq "RUN") {
            if ($jname) {
                # we've found a new job so add the
                # current job to the hash
                $jobhash{$jname} = $nodestr;
            }
            # name = LSF_{user}_{jobid}[_{array_id}]
            $jname = "LSF_" . $elements[1] . '_' . $elements[0];
            # check for an array ID, i.e. 'foo[array ID]'
            my $name = $elements[6];
            my $lastchar = chop $name;
            if ($lastchar eq "]") {
                my @namebits = split /\[/, $name;
                $jname .= '_' . $namebits[-1];
            }
            # get the node, removing any prepended cpu count
            $nodestr = "";
            my @nodes = split /:/, $elements[5];
            foreach my $n (@nodes) {
                my @items = split /\*/, $n;
                $nodestr .= " $items[-1]"
                    if (is_cmu_node($items[-1]));
            }
        }
    }
    # store the last job collected, which no later line flushed
    if ($jname) {
        $jobhash{$jname} = $nodestr;
    }
}
# Add one %jobhash entry per job line printed by qstat.
#
# Only lines with exactly 12 whitespace-separated fields are used, and
# a line whose first field is "Job" is skipped. The job id is the part
# of the first field before the first '.', the user is the second field,
# and the twelfth field is read as a '+'-separated list of items whose
# part before the first '/' is a host name. Hosts unknown to CMU and
# repeated hosts are dropped; jobs left with no hosts are not recorded.
#
# Side effect: sets $jobhash{"PBS_{USER}_{JOBID}"} to the job's hosts
# joined by single spaces.
sub addPBSjobsToHash() {
    my $qstat_output = `$PBSJOBS -n -1`;
    for my $line (split /\n/, $qstat_output) {
        my @fields = split /\s+/, $line;
        next if @fields != 12 || $fields[0] eq "Job";

        # need to extract jobid number
        my $jobid = (split /\./, $fields[0])[0];

        # need to extract list of nodes, keeping first-seen order
        my @hosts;
        my %seen;
        for my $slot (split /\+/, $fields[11]) {
            my ($host) = split /\//, $slot;
            next unless (is_cmu_node($host));
            push @hosts, $host unless $seen{$host}++;
        }
        next if @hosts == 0;

        # name = PBS_{USER}_{JOBID}
        $jobhash{ 'PBS_' . $fields[1] . '_' . $jobid } = join ' ', @hosts;
    }
}
# Expand a compressed node-list selector into individual node names.
#
#   $selector - comma-separated node names, where a name may carry a
#               bracketed list of suffixes and/or suffix ranges,
#               e.g. "n[1-3,5],m7" or "x,n[01-03]"
#
# Returns the list of node names in the order they appear in the
# selector. A range "A-B" is expanded with Perl's range operator, so a
# zero-padded start such as "01" keeps its padding.
sub expand ($) {
    my ($selector) = @_;
    my @nodes;
    while (1) {
        # first look for a range
        my @parts = split /\[/, $selector, 2;
        if (@parts < 2) {
            # no range, so just look for comma-separated nodes
            push @nodes, split /\,/, $selector;
            last;
        }
        # we have the beginning of a range
        # check for comma-separated nodes preceding the range
        my @items = split /\,/, $parts[0];
        my $prefix = pop @items;
        # a selector that starts with '[' has no prefix at all
        $prefix = "" unless defined $prefix;
        push @nodes, @items;
        # process the range
        @items = split /\]/, $parts[1], 2;
        my @pieces = split /\,/, $items[0];
        foreach my $piece (@pieces) {
            if ($piece =~ /(.+)-(.+)/) {
                my ($first, $last) = ($1, $2);
                push @nodes, map { $prefix . $_ } ($first .. $last);
                next;
            }
            push @nodes, $prefix . $piece;
        }
        # prepare the rest of the string or return
        if (@items > 1 && length($items[1]) > 0) {
            $selector = $items[1];
            # remove any leading commas
            my @check = split /\,/, $selector, 2;
            $selector = $check[1] if (length($check[0]) == 0);
            next;
        }
        last;
    }
    return @nodes;
}
This script expects to be able to execute LSF, SLURM, and/or PBS commands
from the CMU management node. Edit the variables at the top of the script as
appropriate. Then make this script executable and run it on the CMU management
node.
As jobs are dispatched by the job scheduler, CMU user groups will become
available for monitoring in the CMU GUI. When the jobs finish, the groups will
disappear. |