This patch overhauls the way that memory resource usage is managed:
- Both vmem and mem now refer to resident set size (RSS), not total
  virtual size. vmem reports the memory used by all processes of the
  job on a single node; mem is the sum of vmem across all nodes in the job.
- Kernel-based setrlimit limits are not used for vmem or mem. Without
this patch, stack size would be limited by a default vmem, for instance.
- Memory that is shared, either by a group of threads or by processes
  using some other shared memory mechanism, is no longer counted more than
  once. Only one copy of the shared memory contributes to the total usage.
---
pbs-2.3.12/src/resmom/linux/mom_mach.c | 226 ++++++++++++++++++++++-----------
pbs-2.3.12/src/resmom/linux/mom_mach.h | 5
2 files changed, 158 insertions(+), 73 deletions(-)
diff -puN src/resmom/linux/mom_mach.c~vmem-accounting src/resmom/linux/mom_mach.c
--- pbs-2.3.12-orig/src/resmom/linux/mom_mach.c~vmem-accounting 2004-08-06 17:52:11.000000000 -0400
+++ pbs-2.3.12/src/resmom/linux/mom_mach.c 2004-08-06 17:52:11.000000000 -0400
@@ -235,8 +235,8 @@ void proc_get_btime()
}
static const char stat_str[] =
-/* pid cmd state xx xx sessid xx xx flags xx */
-"%d (%[^)]) %c %*d %*d %d %*d %*d %u %*u "
+/* pid cmd state ppid xx sessid xx xx flags xx */
+"%d (%[^)]) %c %d %*d %d %*d %*d %u %*u "
/* xx xx xx utime stime cutime cstime xx xx xx xx start_time vsize rss */
"%*u %*u %*u %d %d %d %d %*d %*d %*u %*u %u %u %u";
@@ -267,10 +267,10 @@ get_proc_stat(int pid)
if ((fd = fopen(path, "r")) == NULL)
return(NULL);
- if (fscanf(fd, stat_str, &ps.pid, path, &ps.state,
+ if (fscanf(fd, stat_str, &ps.pid, path, &ps.state, &ps.ppid,
&ps.session, &ps.flags, &ps.utime, &ps.stime,
&ps.cutime, &ps.cstime, &jiffies, &ps.vsize,
- &ps.rss) != 12) {
+ &ps.rss) != 13) {
fclose(fd);
return(NULL);
}
@@ -280,7 +280,30 @@ get_proc_stat(int pid)
} else {
ps.uid = sb.st_uid;
}
+ fclose(fd);
+ /* read statm for shared memory */
+ ps.shm = 0;
+ sprintf(path, "/proc/%d/statm", pid);
+ fd = fopen(path, "r");
+ if (fd) {
+ fscanf(fd, "%*d %*d %d", &ps.shm);
+ fclose(fd);
+ }
+
+ /* read status for tgid */
+ ps.tgid = 0;
+ sprintf(path, "/proc/%d/status", pid);
+ fd = fopen(path, "r");
+ if (fd) {
+ while (fgets(path, sizeof(path), fd)) {
+ if (strncmp(path, "Tgid:", 5) == 0) {
+ sscanf(path, "Tgid:\t%d", &ps.tgid);
+ break;
+ }
+ }
+ fclose(fd);
+ }
ps.start_time = linux_time + (jiffies / 100);
ps.name = path;
@@ -290,7 +313,6 @@ get_proc_stat(int pid)
JTOS(ps.cutime)
JTOS(ps.cstime)
- fclose(fd);
return(&ps);
}
@@ -534,57 +556,26 @@ static int overcpu_proc(pjob, limit)
}
/*
- * Internal session memory usage function.
- *
- * Returns the total number of bytes of address
- * space consumed by all current processes within the job.
- */
-static unsigned long mem_sum(pjob)
- job *pjob;
-{
- char *id="mem_sum";
- struct dirent *dent;
- char procname[100];
- int num, i;
- unsigned long segadd;
- proc_stat_t *ps;
-
- segadd = 0;
- rewinddir(pdir);
-
- while ((dent = readdir(pdir)) != NULL) {
- if (!isdigit(dent->d_name[0]))
- continue;
-
- if ((ps = get_proc_stat(atoi(dent->d_name))) == NULL) {
- if (errno != ENOENT) {
- sprintf(log_buffer,
- "%s: get_proc_stat", dent->d_name);
- log_err(errno, id, log_buffer);
- }
- continue;
- }
-
- if (!injob(pjob, ps->session))
- continue;
- segadd += ps->vsize;
- }
-
- return (segadd);
-}
-
-/*
* Internal session workingset size function.
*/
-static unsigned long resi_sum(pjob)
- job *pjob;
+static unsigned long
+resi_sum(job *pjob)
{
char *id="resi_sum";
- ulong resisize;
+ ulong rss, resisize, max_shm;
struct dirent *dent;
proc_stat_t *ps;
- resisize = 0;
+ /* must keep a table of all to cull threads */
+ static struct {
+ unsigned int pid, ppid, tgid;
+ unsigned long rss, shm, vsize;
+ int thread;
+ } *pids;
+ static int maxpids = 0;
+ int i, j, numpids;
+
+ numpids = 0;
rewinddir(pdir);
while ((dent = readdir(pdir)) != NULL) {
if (!isdigit(dent->d_name[0]))
@@ -602,9 +593,108 @@ static unsigned long resi_sum(pjob)
if (!injob(pjob, ps->session))
continue;
- resisize += ps->rss * pagesize;
+ /*
+ * This is an aggregated total, but for processes that are
+ * threads, or for processes that use shared memory (posix or
+ * sysv), we don't want to double-count the shared areas.
+ *
+ * Threads: all but the first thread has PF_FORKNOEXEC flag
+ * set, but statm shows hardly any memory in shared. This is a
+ * problem, hence we go to the complexity of guessing at
+ * thread-ness as do ps and top.
+ *
+ * Fork with shmem: also have PF_FORKNOEXEC, but shared field
+ * of statm will be large in each process.
+ *
+ * Fork & exec with shmem: none have PF_FORKNOEXEC, but
+ * significant amount of shared memory.
+ *
+ * Fork, no shmem usage: also have PF_FORKNOEXEC, but small
+ * shared memory usage. This code misdiagnoses these forked
+ * processes as threads, unfortunately, and underreports the
+ * memory usage. Hopefully this is not common.
+ *
+ * Another caveat: multiple independent codes which use shared
+ * memory will also be underreported. The different shared
+ * memory segments are not disambiguated and will appear as one
+ * large shared section. Parsing /proc/pid/maps to identify
+ * the shared memory segments may be the only way to fix this.
+ * This case also is likely rare.
+ *
+ * After filtering out the threads, we subtract out the shared
+ * memory component from the RSS of each process, add up that
+ * difference, then add back in one shared memory. This may
+ * undercount the case where different processes share
+ * different memory regions, but is good for the most general
+ * case of one big shared memory space for all processes in a
+ * job.
+ */
+
+ /* grow pids[] array over time */
+ if (numpids == maxpids) {
+ void *x = pids;
+ maxpids += 20;
+ pids = malloc(maxpids * sizeof(*pids));
+ if (!pids) {
+ log_err(0, id, "alloc more pids");
+ exit(1);
+ }
+ if (numpids) {
+ memcpy(pids, x, numpids * sizeof(*pids));
+ free(x);
+ }
+ }
+
+ pids[numpids].pid = ps->pid;
+ pids[numpids].ppid = ps->ppid;
+ pids[numpids].tgid = ps->tgid;
+ pids[numpids].rss = ps->rss;
+ pids[numpids].shm = ps->shm;
+ pids[numpids].vsize = ps->vsize;
+ pids[numpids].thread = 0;
+ ++numpids;
}
+ /* cull threads out of the pids */
+ for (i=0; i max_shm)
+ max_shm = pids[i].shm;
+ if (pids[i].rss >= pids[i].shm)
+ rss = pids[i].rss - pids[i].shm;
+ else
+ rss = 0;
+ resisize += rss;
+ }
+ resisize += max_shm;
+ resisize *= pagesize;
+
return (resisize);
}
@@ -708,7 +798,6 @@ int mom_set_limits(pjob, set_mode)
unsigned long value; /* place in which to build resource value */
resource *pres;
struct rlimit reslim;
- unsigned long mem_limit = 0;
DBPRT(("%s: entered\n", id))
assert(pjob != NULL);
@@ -752,12 +841,8 @@ int mom_set_limits(pjob, set_mode)
if (setrlimit(RLIMIT_FSIZE, &reslim) < 0)
return (error(pname, PBSE_SYSTEM));
}
- } else if (strcmp(pname, "vmem") == 0) { /* check */
- retval = getsize(pres, &value);
- if (retval != PBSE_NONE)
- return (error(pname, retval));
- if ((mem_limit == 0) || (value < mem_limit))
- mem_limit = value;
+ } else if (strcmp(pname, "vmem") == 0) { /* ignore */
+ /* vmem monitoring dynamic, sum over procs of rss */
} else if (strcmp(pname, "pvmem") == 0) { /* set */
if (set_mode == SET_LIMIT_SET) {
retval = getsize(pres, &value);
@@ -765,10 +850,13 @@ int mom_set_limits(pjob, set_mode)
return (error(pname, retval));
if (value > INT_MAX)
return (error(pname, PBSE_BADATVAL));
- if ((mem_limit == 0) || (value < mem_limit))
- mem_limit = value;
+ if (setrlimit(RLIMIT_DATA, &reslim) < 0)
+ return (error("RLIMIT_DATA", PBSE_SYSTEM));
+ if (setrlimit(RLIMIT_STACK, &reslim) < 0)
+ return (error("RLIMIT_STACK", PBSE_SYSTEM));
}
} else if (strcmp(pname, "mem") == 0) { /* ignore */
+ /* also dynamic, sum over nodes of vmem */
} else if (strcmp(pname, "pmem") == 0) { /* set */
if (set_mode == SET_LIMIT_SET) {
retval = getsize(pres, &value);
@@ -794,16 +882,6 @@ int mom_set_limits(pjob, set_mode)
return (error(pname, PBSE_UNKRESC));
pres = (resource *)GET_NEXT(pres->rs_link);
}
- if (set_mode == SET_LIMIT_SET) {
- /* if either of vmem or pvmem was given, set sys limit to lesser */
- if (mem_limit != 0) {
- reslim.rlim_cur = reslim.rlim_max = mem_limit;
- if (setrlimit(RLIMIT_DATA, &reslim) < 0)
- return (error("RLIMIT_DATA", PBSE_SYSTEM));
- if (setrlimit(RLIMIT_STACK, &reslim) < 0)
- return (error("RLIMIT_STACK", PBSE_SYSTEM));
- }
- }
return (PBSE_NONE);
}
/*
@@ -917,10 +995,11 @@ int mom_over_limit(pjob)
return (TRUE);
}
} else if (strcmp(pname, "vmem") == 0) {
+ /* redefined to be resident mem, not vsize --pw */
retval = getsize(pres, &value);
if (retval != PBSE_NONE)
continue;
- if ((num = mem_sum(pjob)) > value) {
+ if ((num = resi_sum(pjob)) > value) {
sprintf(log_buffer,
"vmem %lu exceeded limit %lu",
num, value);
@@ -968,6 +1047,7 @@ int mom_set_use(pjob)
attribute *at;
resource_def *rd;
unsigned long *lp, lnum;
+ unsigned long resisum;
assert(pjob != NULL);
at = &pjob->ji_wattr[(int)JOB_ATR_resc_used];
@@ -1014,12 +1094,14 @@ int mom_set_use(pjob)
lnum = cput_sum(pjob);
*lp = MAX(*lp, lnum);
+ resisum = resi_sum(pjob);
+
rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size);
assert(rd != NULL);
pres = find_resc_entry(at, rd);
assert(pres != NULL);
lp = &pres->rs_value.at_val.at_size.atsv_num;
- lnum = (mem_sum(pjob) + 1023) >> 10; /* as KB */
+ lnum = (resisum + 1023) >> 10; /* as KB */
*lp = MAX(*lp, lnum);
rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size);
@@ -1033,7 +1115,7 @@ int mom_set_use(pjob)
pres = find_resc_entry(at, rd);
assert(pres != NULL);
lp = &pres->rs_value.at_val.at_size.atsv_num;
- lnum = (resi_sum(pjob) + 1023) >> 10; /* as KB */
+ lnum = (resisum + 1023) >> 10; /* as KB */
*lp = MAX(*lp, lnum);
return (PBSE_NONE);
diff -puN src/resmom/linux/mom_mach.h~vmem-accounting src/resmom/linux/mom_mach.h
--- pbs-2.3.12-orig/src/resmom/linux/mom_mach.h~vmem-accounting 2004-08-06 17:52:11.000000000 -0400
+++ pbs-2.3.12/src/resmom/linux/mom_mach.h 2004-08-06 17:52:11.000000000 -0400
@@ -122,10 +122,13 @@ typedef struct proc_stat {
unsigned stime; /* stime this process */
unsigned cutime; /* sum of children's utime */
unsigned cstime; /* sum of children's stime */
- int pid; /* process id */
+ unsigned int pid; /* process id */
+ unsigned int ppid; /* parent process id */
+ unsigned int tgid; /* thread group id */
char *name; /* name of exec'd command */
unsigned vsize; /* virtual memory size for proc */
unsigned rss; /* resident set size */
+ unsigned shm; /* shared memory size, pages */
unsigned start_time; /* start time of this process */
unsigned flags; /* the flags of the process */
unsigned uid; /* uid of the process owner */
_