This patch overhauls the way that memory resource usage is managed: --- pbs-2.3.12/src/resmom/linux/mom_mach.c | 226 ++++++++++++++++++++++----------- pbs-2.3.12/src/resmom/linux/mom_mach.h | 5 2 files changed, 158 insertions(+), 73 deletions(-) diff -puN src/resmom/linux/mom_mach.c~vmem-accounting src/resmom/linux/mom_mach.c --- pbs-2.3.12-orig/src/resmom/linux/mom_mach.c~vmem-accounting 2004-08-06 17:52:11.000000000 -0400 +++ pbs-2.3.12/src/resmom/linux/mom_mach.c 2004-08-06 17:52:11.000000000 -0400 @@ -235,8 +235,8 @@ void proc_get_btime() } static const char stat_str[] = -/* pid cmd state xx xx sessid xx xx flags xx */ -"%d (%[^)]) %c %*d %*d %d %*d %*d %u %*u " +/* pid cmd state ppid xx sessid xx xx flags xx */ +"%d (%[^)]) %c %d %*d %d %*d %*d %u %*u " /* xx xx xx utime stime cutime cstime xx xx xx xx start_time vsize rss */ "%*u %*u %*u %d %d %d %d %*d %*d %*u %*u %u %u %u"; @@ -267,10 +267,10 @@ get_proc_stat(int pid) if ((fd = fopen(path, "r")) == NULL) return(NULL); - if (fscanf(fd, stat_str, &ps.pid, path, &ps.state, + if (fscanf(fd, stat_str, &ps.pid, path, &ps.state, &ps.ppid, &ps.session, &ps.flags, &ps.utime, &ps.stime, &ps.cutime, &ps.cstime, &jiffies, &ps.vsize, - &ps.rss) != 12) { + &ps.rss) != 13) { fclose(fd); return(NULL); } @@ -280,7 +280,30 @@ get_proc_stat(int pid) } else { ps.uid = sb.st_uid; } + fclose(fd); + /* read statm for shared memory */ + ps.shm = 0; + sprintf(path, "/proc/%d/statm", pid); + fd = fopen(path, "r"); + if (fd) { + fscanf(fd, "%*d %*d %d", &ps.shm); + fclose(fd); + } + + /* read status for tgid */ + ps.tgid = 0; + sprintf(path, "/proc/%d/status", pid); + fd = fopen(path, "r"); + if (fd) { + while (fgets(path, sizeof(path), fd)) { + if (strncmp(path, "Tgid:", 5) == 0) { + sscanf(path, "Tgid:\t%d", &ps.tgid); + break; + } + } + fclose(fd); + } ps.start_time = linux_time + (jiffies / 100); ps.name = path; @@ -290,7 +313,6 @@ get_proc_stat(int pid) JTOS(ps.cutime) JTOS(ps.cstime) - fclose(fd); return(&ps); } @@ -534,57 
+556,26 @@ static int overcpu_proc(pjob, limit) } /* - * Internal session memory usage function. - * - * Returns the total number of bytes of address - * space consumed by all current processes within the job. - */ -static unsigned long mem_sum(pjob) - job *pjob; -{ - char *id="mem_sum"; - struct dirent *dent; - char procname[100]; - int num, i; - unsigned long segadd; - proc_stat_t *ps; - - segadd = 0; - rewinddir(pdir); - - while ((dent = readdir(pdir)) != NULL) { - if (!isdigit(dent->d_name[0])) - continue; - - if ((ps = get_proc_stat(atoi(dent->d_name))) == NULL) { - if (errno != ENOENT) { - sprintf(log_buffer, - "%s: get_proc_stat", dent->d_name); - log_err(errno, id, log_buffer); - } - continue; - } - - if (!injob(pjob, ps->session)) - continue; - segadd += ps->vsize; - } - - return (segadd); -} - -/* * Internal session workingset size function. */ -static unsigned long resi_sum(pjob) - job *pjob; +static unsigned long +resi_sum(job *pjob) { char *id="resi_sum"; - ulong resisize; + ulong rss, resisize, max_shm; struct dirent *dent; proc_stat_t *ps; - resisize = 0; + /* must keep a table of all to cull threads */ + static struct { + unsigned int pid, ppid, tgid; + unsigned long rss, shm, vsize; + int thread; + } *pids; + static int maxpids = 0; + int i, j, numpids; + + numpids = 0; rewinddir(pdir); while ((dent = readdir(pdir)) != NULL) { if (!isdigit(dent->d_name[0])) @@ -602,9 +593,108 @@ static unsigned long resi_sum(pjob) if (!injob(pjob, ps->session)) continue; - resisize += ps->rss * pagesize; + /* + * This is an aggregated total, but for processes that are + * threads, or for processes that use shared memory (posix or + * sysv), we don't want to double-count the shared areas. + * + * Threads: all but the first thread has PF_FORKNOEXEC flag + * set, but statm shows hardly any memory in shared. This is a + * problem, hence we go to the complexity of guessing at + * thread-ness as do ps and top. 
+ * + * Fork with shmem: also have PF_FORKNOEXEC, but the shared field + * of statm will be large in each process. + * + * Fork & exec with shmem: none have PF_FORKNOEXEC, but each has a + * significant amount of shared memory. + * + * Fork, no shmem usage: also have PF_FORKNOEXEC, but small + * shared memory usage. This code misdiagnoses these forked + * processes as threads, unfortunately, and underreports their + * memory usage. Hopefully this is not common. + * + * Another caveat: multiple independent programs which use shared + * memory will also be underreported. The different shared + * memory segments are not disambiguated and will appear as one + * large shared section. Parsing /proc/pid/maps to identify + * the shared memory segments may be the only way to fix this. + * This case is also likely rare. + * + * After filtering out the threads, we subtract out the shared + * memory component from the RSS of each process, add up those + * differences, then add back in one copy of the shared memory (the largest shared size of any process). This may + * undercount the case where different processes share + * different memory regions, but is good for the most general + * case of one big shared memory space for all processes in a + * job.
+ */ + + /* grow pids[] array over time */ + if (numpids == maxpids) { + void *x = pids; + maxpids += 20; + pids = malloc(maxpids * sizeof(*pids)); + if (!pids) { + log_err(0, id, "alloc more pids"); + exit(1); + } + if (numpids) { + memcpy(pids, x, numpids * sizeof(*pids)); + free(x); + } + } + + pids[numpids].pid = ps->pid; + pids[numpids].ppid = ps->ppid; + pids[numpids].tgid = ps->tgid; + pids[numpids].rss = ps->rss; + pids[numpids].shm = ps->shm; + pids[numpids].vsize = ps->vsize; + pids[numpids].thread = 0; + ++numpids; } + /* cull threads out of the pids */ + for (i=0; i max_shm) + max_shm = pids[i].shm; + if (pids[i].rss >= pids[i].shm) + rss = pids[i].rss - pids[i].shm; + else + rss = 0; + resisize += rss; + } + resisize += max_shm; + resisize *= pagesize; + return (resisize); } @@ -708,7 +798,6 @@ int mom_set_limits(pjob, set_mode) unsigned long value; /* place in which to build resource value */ resource *pres; struct rlimit reslim; - unsigned long mem_limit = 0; DBPRT(("%s: entered\n", id)) assert(pjob != NULL); @@ -752,12 +841,8 @@ int mom_set_limits(pjob, set_mode) if (setrlimit(RLIMIT_FSIZE, &reslim) < 0) return (error(pname, PBSE_SYSTEM)); } - } else if (strcmp(pname, "vmem") == 0) { /* check */ - retval = getsize(pres, &value); - if (retval != PBSE_NONE) - return (error(pname, retval)); - if ((mem_limit == 0) || (value < mem_limit)) - mem_limit = value; + } else if (strcmp(pname, "vmem") == 0) { /* ignore */ + /* vmem monitoring dynamic, sum over procs of rss */ } else if (strcmp(pname, "pvmem") == 0) { /* set */ if (set_mode == SET_LIMIT_SET) { retval = getsize(pres, &value); @@ -765,10 +850,13 @@ int mom_set_limits(pjob, set_mode) return (error(pname, retval)); if (value > INT_MAX) return (error(pname, PBSE_BADATVAL)); - if ((mem_limit == 0) || (value < mem_limit)) - mem_limit = value; + if (setrlimit(RLIMIT_DATA, &reslim) < 0) + return (error("RLIMIT_DATA", PBSE_SYSTEM)); + if (setrlimit(RLIMIT_STACK, &reslim) < 0) + return 
(error("RLIMIT_STACK", PBSE_SYSTEM)); } } else if (strcmp(pname, "mem") == 0) { /* ignore */ + /* also dynamic, sum over nodes of vmem */ } else if (strcmp(pname, "pmem") == 0) { /* set */ if (set_mode == SET_LIMIT_SET) { retval = getsize(pres, &value); @@ -794,16 +882,6 @@ int mom_set_limits(pjob, set_mode) return (error(pname, PBSE_UNKRESC)); pres = (resource *)GET_NEXT(pres->rs_link); } - if (set_mode == SET_LIMIT_SET) { - /* if either of vmem or pvmem was given, set sys limit to lesser */ - if (mem_limit != 0) { - reslim.rlim_cur = reslim.rlim_max = mem_limit; - if (setrlimit(RLIMIT_DATA, &reslim) < 0) - return (error("RLIMIT_DATA", PBSE_SYSTEM)); - if (setrlimit(RLIMIT_STACK, &reslim) < 0) - return (error("RLIMIT_STACK", PBSE_SYSTEM)); - } - } return (PBSE_NONE); } /* @@ -917,10 +995,11 @@ int mom_over_limit(pjob) return (TRUE); } } else if (strcmp(pname, "vmem") == 0) { + /* redefined to be resident mem, not vsize --pw */ retval = getsize(pres, &value); if (retval != PBSE_NONE) continue; - if ((num = mem_sum(pjob)) > value) { + if ((num = resi_sum(pjob)) > value) { sprintf(log_buffer, "vmem %lu exceeded limit %lu", num, value); @@ -968,6 +1047,7 @@ int mom_set_use(pjob) attribute *at; resource_def *rd; unsigned long *lp, lnum; + unsigned long resisum; assert(pjob != NULL); at = &pjob->ji_wattr[(int)JOB_ATR_resc_used]; @@ -1014,12 +1094,14 @@ int mom_set_use(pjob) lnum = cput_sum(pjob); *lp = MAX(*lp, lnum); + resisum = resi_sum(pjob); + rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = &pres->rs_value.at_val.at_size.atsv_num; - lnum = (mem_sum(pjob) + 1023) >> 10; /* as KB */ + lnum = (resisum + 1023) >> 10; /* as KB */ *lp = MAX(*lp, lnum); rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size); @@ -1033,7 +1115,7 @@ int mom_set_use(pjob) pres = find_resc_entry(at, rd); assert(pres != NULL); lp = &pres->rs_value.at_val.at_size.atsv_num; - lnum = (resi_sum(pjob) + 1023) 
>> 10; /* as KB */ + lnum = (resisum + 1023) >> 10; /* as KB */ *lp = MAX(*lp, lnum); return (PBSE_NONE); diff -puN src/resmom/linux/mom_mach.h~vmem-accounting src/resmom/linux/mom_mach.h --- pbs-2.3.12-orig/src/resmom/linux/mom_mach.h~vmem-accounting 2004-08-06 17:52:11.000000000 -0400 +++ pbs-2.3.12/src/resmom/linux/mom_mach.h 2004-08-06 17:52:11.000000000 -0400 @@ -122,10 +122,13 @@ typedef struct proc_stat { unsigned stime; /* stime this process */ unsigned cutime; /* sum of children's utime */ unsigned cstime; /* sum of children's stime */ - int pid; /* process id */ + unsigned int pid; /* process id */ + unsigned int ppid; /* parent process id */ + unsigned int tgid; /* thread group id */ char *name; /* name of exec'd command */ unsigned vsize; /* virtual memory size for proc */ unsigned rss; /* resident set size */ + unsigned shm; /* shared memory size, pages */ unsigned start_time; /* start time of this process */ unsigned flags; /* the flags of the process */ unsigned uid; /* uid of the process owner */ _