source: proiecte/pmake3d/make3d_original/Make3dSingleImageStanford_version0.1/third_party/vrippack-0.31/src/pvrip/loadbalance @ 37

Last change on this file since 37 was 37, checked in by (none), 14 years ago

Added original make3d

  • Property svn:executable set to *
File size: 14.6 KB
Line 
1#!/usr/bin/perl
2#
3# loadbalance:
4# This script tries to limit the load on a computer.
5# It is applicable to large batches of small jobs.
6# It runs only a few jobs at a time, with low priority, and
7# watches the uptime to make sure it stays below the load limit.
8#
9# Usage:  loadbalance <maxloadfile> <commandfile> [options]
10# Options:
11#          -logdir logdir   saves the cmd,stdout,stderr files
12#                           to another logdir (default is
13#                           /tmp/loadbalance_PID
14#          -xload           Do pop up xload windows
15#          -noxload         Don't pop up xload windows (default)
16#
17# Where:   <maxloadfile> is a file that lists the maximum load for
18#          each machine.  It should list each machine on a separate
19#          line, e.g:
20#
21#              radiance 6
22#              lambert 6
23#              wavelet 2
24#              snell 1
25#         
26#          You can change this file while it's running, and it will
27#          try to match the new numbers.  It will not run any processes
28#          on machines unlisted here.
29#
30#          <commandfile>  Is a list of commands to be executed, one
31#          command per line.  They should not depend on being executed
32#          within the initial directory; e.g:
33#
34#              cd /usr/data/raytrace; myrt -o f0.rgb f0.iv
35#              cd /usr/data/raytrace; myrt -o f1.rgb f1.iv
36#              cd /usr/data/raytrace; myrt -o f2.rgb f2.iv
37#              cd /usr/data/raytrace; myrt -o f3.rgb f3.iv
38#              ....
39#
40#          You can use a single dash ("-") instead of commandfile,
41#          and it will read from stdin.
42#
43#
44
# Print the command-line help text on STDERR, then terminate the
# script with exit status -1.  This routine never returns.
sub printUsage {
    # One entry per output line; the newline is appended when printing.
    my @usage_lines = (
        "Usage:  loadbalance <maxloadfile> <commandfile> [options]",
        "Options:",
        "         -logdir logdir   saves the cmd,stdout,stderr files",
        "                          to another logdir (default is",
        "                          /tmp/loadbalance_PID",
        "         -xload           Do pop up xload windows",
        "         -noxload         Don't pop up xload windows (default)",
        "",
        "Where:   <maxloadfile> is a file that lists the maximum load for",
        "         each machine.  An optional 3rd word lists the minimum",
        "         number of jobs (not load!) for each machine.",
        "         maxloadfile should list each machine on a separate line, e.g:",
        "",
        "             radiance 6 3",
        "             lambert 6 2",
        "             wavelet 2",
        "             snell 1",
        "             aegean 0",
        "         ",
        "         You can change this file while it's running, and it will",
        "         try to match the new numbers.  However, you can only change",
        "         the numbers, NOT THE MACHINES.  Please keep the same machines",
        "         in the same order, to aid bookkeeping.",
        "",
        "         <commandfile>  Is a list of commands to be executed, one",
        "         command per line.  They should not depend on being executed",
        "         within the initial directory; e.g:",
        "",
        "             cd /usr/data/raytrace; myrt -o f0.rgb f0.iv",
        "             cd /usr/data/raytrace; myrt -o f1.rgb f1.iv",
        "             cd /usr/data/raytrace; myrt -o f2.rgb f2.iv",
        "             cd /usr/data/raytrace; myrt -o f3.rgb f3.iv",
        "             ....",
        "",
        '         You can use a single dash ("-") instead of commandfile,',
        "         and it will read from stdin.",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line, "\n";
    }
    exit(-1);
}
84
# Timesteps, in seconds. This is how often findHost re-polls the hosts
# to see if it can spawn a new process.
$TIMEINTERVAL = 6.0;
# Factor applied to every PENDING entry each time step.
# This decay compensates for uptime's 1-min lag.
$STEPDECAY = 0.8;

# Option defaults
$DOXLOAD = 0;

# Default directory for log files:
#   <automount prefix><this host>/tmp/loadbalance_<pid of this script>
$AUTOMOUNTPREFIX = "/n/";
$BASEHOST = `hostname`;
# chomp rather than chop: only ever strip a trailing newline, never a
# real character of the host name.
chomp $BASEHOST;
$LOGDIR = $AUTOMOUNTPREFIX.$BASEHOST."/tmp/loadbalance_$$";

# Make STDOUT autoflush, so that status text printed on STDOUT without
# a trailing newline shows up immediately.
use IO::Handle;
STDOUT->autoflush(1);
104
# Open command and maxload files.
# $ARGV[0] names the maxload file; $ARGV[1] names the command file, or
# is "-" to read the commands from stdin.
if ($#ARGV < 1) {
    print STDERR "Wrong number of arguments...\n";
    printUsage();    # prints the help text and exits
} else {
    # Figure out and open maxload file...
    $maxloadfile = $ARGV[0];
    if (substr($maxloadfile,0,1) eq "-") {
        print STDERR "Unknown flag: $ARGV[0]...\n";
        printUsage();
    } else {
        # Three-argument open, so the file name is never parsed for a
        # mode character.
        if (!open(MAXLOAD, '<', $maxloadfile)) {
            print  STDERR "Unable to open maxload file $maxloadfile...\n";
            printUsage();
        }

        # Initialize some arrays for tracking load, and then
        # close maxload for now... we'll open it every time we want
        # to check it, so that it can be updated on-the-fly
        print STDERR "Host check:  Making sure all listed hosts are reachable,\n";
        print STDERR "             and have proper ssh permissions...\n";
        for ($n=0; defined($line = <MAXLOAD>); $n++) {
            ($host[$n], $limit[$n]) = split(' ', $line);

            # First just run a simple ssh, to verify host is usable.
            if ($limit[$n] >= 1) {
                # since rsh is disabled on most machines, I changed this to ssh - leslie
                $errstat = `ssh $host[$n] date\n`;
                if ($?) {
                    die "Error: `ssh $host[$n] date` failed.  Fix ssh settings,\n".
                        "or remove host $host[$n] from loadlimit file: $maxloadfile.\n";
                }
            } else {
                print STDERR "$host[$n]: Skipping ssh check, limit less than 1.\n";
            }

            # Get initial uptime info.
            # NOTE(review): this still contacts hosts whose ssh check was
            # skipped above, because uptime() itself runs ssh.
            $load[$n] = uptime($host[$n]);
            $oldload[$n] = $load[$n];
            $pending[$n] = 0;
            print STDERR "Using host $host[$n], load $load[$n], limit $limit[$n].\n";
            $guessload[$n] = 0;
        }
        close(MAXLOAD);
    }

    # Figure out and open command file...
    $commandfile = $ARGV[1];
    if ($commandfile eq '-') {
        # Duplicate STDIN with an explicit read mode, and check the result.
        if (!open(COMMAND, "<&STDIN")) {
            die "Error: unable to read commands from stdin: $!\n";
        }
        # Total is unknown when streaming from stdin.
        $ncommands = "???";
    } elsif (substr($commandfile,0,1) eq "-") {
        print STDERR "Unknown flag: $ARGV[1]...\n";
        printUsage();
    } else {
        if (!open(COMMAND, '<', $commandfile)) {
            print  STDERR "Unable to open command file $commandfile...\n";
            printUsage();
        }

        # Count the commands in Perl instead of shelling out to `wc -l`:
        # no shell quoting problems with the file name, and a last line
        # without a trailing newline is still counted.
        $ncommands = 0;
        while (defined(my $counted_line = <COMMAND>)) {
            $ncommands++;
        }

        # Reopen so the main loop starts again from the first command.
        close(COMMAND);
        if (!open(COMMAND, '<', $commandfile)) {
            print  STDERR "Unable to open command file $commandfile...\n";
            printUsage();
        }
    }
}
172
# Parse options: everything after the two positional arguments.
# Recognised: -xload, -noxload, and -logdir <dir>.  Anything else is
# reported and ends the script through printUsage.
$currarg = 2;
while ($currarg <= $#ARGV) {
    my $option = $ARGV[$currarg];

    if ($option eq "-xload") {
        $DOXLOAD = 1;
        $currarg++;
    } elsif ($option eq "-noxload") {
        $DOXLOAD = 0;
        $currarg++;
    } elsif ($option eq "-logdir") {
        # The directory is the word following the flag.
        $LOGDIR = $ARGV[$currarg+1];
        $currarg += 2;

        # A missing directory argument is an error.
        if ($LOGDIR eq "") {
            print STDERR "Error: no logdir???\n\n";
            &printUsage();
        }

        # Turn a relative logdir into an absolute one.
        if ($LOGDIR !~ m{^/}) {
            $PWD = `pwd`;
            chop $PWD;
            $LOGDIR = $PWD . "/" . $LOGDIR;
        }

        # Prepend /n/basehost unless the path already starts with the
        # automount prefix.
        if (index($LOGDIR, $AUTOMOUNTPREFIX) != 0) {
            $LOGDIR = $AUTOMOUNTPREFIX . $BASEHOST . $LOGDIR;
        }
    } else {
        print STDERR "Error: Unhandled arg $ARGV[$currarg].\n\n";
        &printUsage();
    }
}
207
# Make sure logdir is usable: an existing directory must be searchable
# and writable and is purged of old loadbalance files; a missing one is
# created.
if (-e $LOGDIR) {
    if (-d $LOGDIR) {
        -x $LOGDIR || die "Error: logdir $LOGDIR does not have execute permissions\n";
        -w $LOGDIR || die "Error: logdir $LOGDIR does not have write permissions\n";

        # Clear any old loadbalance logfiles.  This matters because
        # leftover loadbalance_ps_* files would be counted as jobs that
        # are still running.  The previous shell pipeline used the egrep
        # pattern '$loadbalance_' (end-of-line anchor first), which can
        # never match, so nothing was ever removed.
        print STDERR "Clearing old log files....";
        if (opendir(my $logdir_handle, $LOGDIR)) {
            my @stale = grep { /^loadbalance_/ && !-d "$LOGDIR/$_" }
                        readdir($logdir_handle);
            closedir($logdir_handle);
            foreach my $stale_name (@stale) {
                unlink("$LOGDIR/$stale_name")
                    or print STDERR "\nWarning: could not remove $LOGDIR/$stale_name: $!\n";
            }
        } else {
            print STDERR "\nWarning: could not read logdir $LOGDIR: $!\n";
        }
        print STDERR "Done.\n";

    } else {
        print STDERR "Error: loadbalance: logdir $LOGDIR exists, \n".
            "and is not a directory.\n";
        printUsage();
    }
} else {
    # make logdir (single level, like a plain `mkdir`)
    if (!mkdir($LOGDIR)) {
        die "Error: Could not mkdir $LOGDIR\n";
    }
}
234
# When -xload was requested, start one background xload monitor (via
# ssh) on every host whose limit is at least 1.
if ($DOXLOAD) {
    print STDERR "Starting xloads...";
    foreach my $idx (0 .. $#host) {
        # Hosts with a limit below 1 get no window.
        next unless $limit[$idx] >= 1;

        # Graph scale is one above the integer part of the limit; the
        # window height grows with the scale.
        my $scale  = int($limit[$idx]) + 1;
        my $height = 20 * ($scale + 1);

        system("ssh $host[$idx] xload -fg green -hl blue -bg black "
               . "-scale $scale -geom 300x$height &\n");
    }
    print STDERR "Done!\n";
}
252
253
254######################################################################
255##########
256##########      Main Loop
257##########
258######################################################################
259
260
# Loop until commands are exhausted.
# "defined" makes sure a final line such as "0" without a newline does
# not end the loop early.
for ($cmdno=1; defined($commandline = <COMMAND>); $cmdno++) {
    # chomp, not chop: a last line without a trailing newline must not
    # lose its final character.
    chomp($commandline);

    # Find a host for ssh.  Wait, if necessary...
    $host = findHost();
    # Marker file that records the job as running; the job removes it.
    $psfile = addps($host, $cmdno);

    # Derive the per-job file names from the marker file name.  Only the
    # "_ps_" in the last path component is rewritten, so a logdir whose
    # own name contains "_ps_" is left alone.
    ($stdoutname = $psfile) =~ s{_ps_([^/]*)\z}{_stdout_$1};
    ($stderrname = $psfile) =~ s{_ps_([^/]*)\z}{_stderr_$1};
    ($cmdname    = $psfile) =~ s{_ps_([^/]*)\z}{_cmd_$1};

    # Set up the command script: one statement per line (every ";" in
    # the command becomes a newline), followed by removal of the marker.
    # NOTE(review): the script is run with "csh -ef"; if -e aborts it on
    # a failing statement the final /bin/rm is never reached and the
    # marker file stays behind -- confirm whether that is intended.
    if (!open(CMD, '>', $cmdname)) {
        print STDERR "WARNING!  Could not write $cmdname ($!); ".
            "skipping command $cmdno.\n";
        # Drop the marker again so this job is not waited for forever.
        unlink($psfile);
        next;
    }
    $cmdlines = $commandline;
    $cmdlines =~ s/;/\n/g;
    print CMD "$cmdlines\n";
    print CMD "/bin/rm $psfile\n";
    close(CMD);

    # run 1 copy of the program
    # Note, the command must be run in the background, or else this
    # script will never run more than 1 at a time... :-)
    $cmd = "ssh $host /bin/nice -20 csh -ef $cmdname &\n";
    print STDERR $cmd;

    # Redirect STDOUT, STDERR to the per-job log files, so the child
    # inherits them; keep duplicates to restore afterwards.
    open(SAVEOUT, ">&STDOUT");
    open(SAVEERR, ">&STDERR");
    open(STDOUT, '>', $stdoutname)
        or print SAVEERR "WARNING!  Could not open $stdoutname: $!\n";
    open(STDERR, '>', $stderrname)
        or print SAVEERR "WARNING!  Could not open $stderrname: $!\n";
    select(STDERR); $| = 1;
    select(STDOUT); $| = 1;

    # Actually run the ssh command
    system($cmd);

    # restore STDOUT, STDERR and release the saved duplicates
    open(STDOUT, ">&SAVEOUT");
    open(STDERR, ">&SAVEERR");
    close(SAVEOUT);
    close(SAVEERR);
}
311
# Once all the commands are started, wait for them all to finish.
# The count is forced to an integer right away, so the message below
# never contains padding or a newline from the counting helper.
$nleft = int(countallps());
print STDERR "loadbalance waiting for $nleft processes to finish...\n";
while ($nleft > 0) {
    $oleft = $nleft;
    $nleft = int(countallps());

    # Only report when the number of outstanding jobs changed.
    if ($nleft != $oleft) {
        print STDERR "$nleft...";
    }
    # No point in sleeping once everything has finished.
    sleep 4 if ($nleft > 0);
}
print STDERR " Done!\n";
327
328
329
330
331
332######################################################################
333##########
334##########      Helper functions
335##########
336######################################################################
337
338
# Find a host for ssh.  Wait, if necessary.
#
# Repeatedly walks the host list until one machine has room for another
# job, and returns that machine's name.  A host qualifies when either
#   * estimated load + pending + 1 and job count + 1 are both within
#     its limit, or
#   * it currently runs fewer jobs than its minimum job count.
# Between passes the maxload file is re-read (so limits can be edited
# while running), the routine sleeps $TIMEINTERVAL seconds, and every
# pending value is multiplied by $STEPDECAY.
#
# Reads/updates the globals @host, @limit, @minlimit, @load, @oldload,
# @pending, $midmaxloadfile and $icommand.
sub findHost {
    while (1) {
        # Make sure we have reloaded maxload file, if necessary.
        if (!$midmaxloadfile) {
            if (open(my $maxload_fh, '<', $maxloadfile)) {
                my $row = 0;
                while (defined(my $entry = <$maxload_fh>)) {
                    my @fields = split(' ', $entry);
                    $host[$row]     = $fields[0];
                    $limit[$row]    = $fields[1];
                    $minlimit[$row] = $fields[2];
                    $row++;
                }
                close($maxload_fh);
            } else {
                # The file may be mid-edit; carry on with the old numbers.
                print STDERR "WARNING!  Could not reread $maxloadfile ($!), ".
                    "keeping previous limits.\n";
            }
            $midmaxloadfile = 1;
        }

        # Run through the list, checking uptime...
        foreach my $n (0 .. $#host) {
            # Skip machines with limit 0
            next if ($limit[$n] == 0);

            # Compute guessload, our guess of the what the load "should"
            # be....
            $load[$n] = uptime($host[$n]);
            my $guessload = $load[$n];
            if ($load[$n] > $oldload[$n]) {
                # Add derivative to the guess if the load is rising: the
                # change per time step, scaled up to one minute.
                # (This used to be added to an unrelated scalar, so the
                # guess never differed from the measured load.)
                $guessload += (60.0 / $TIMEINTERVAL) * ($load[$n] - $oldload[$n]);
            }
            # Remember the measured load (not the guess) for the next pass.
            $oldload[$n] = $load[$n];

            # Count number of processes still running on this host
            my $pscount = 0 + countps($host[$n]);

            # Reduce pending to pscount, if it's too big
            # Since the pending load (from our jobs) cannot be larger
            # than the total number of (our) jobs
            $pending[$n] = $pscount if ($pending[$n] > $pscount);

            if (($guessload + $pending[$n] + 1 <= $limit[$n] &&
                 $pscount + 1 <= $limit[$n]) ||
                $minlimit[$n] > $pscount) {
                # We found a processor to use....
                print STDERR "========\n";
                $icommand++;
                print STDERR "Using $host[$n] ($icommand of $ncommands): load: $load[$n], guess: $guessload,".
                    " pending: $pending[$n], limit: $limit[$n], ".
                        "pscount: $pscount ...\n";
                # add 1.1, to be cautious...
                $pending[$n] += 1.1;
                return($host[$n]);
            }
        }

        # Print waiting message so people know it's still alive...
        # The trailing "\r" makes the next status line overwrite this one.
        print STDOUT "Uptimes: ";
        foreach my $n (0 .. $#host) {
            print STDOUT $host[$n]." ".$load[$n].", ";
        }
        print "\r";

        # If we get here, we ran through the whole list.
        # Wait $TIMEINTERVAL seconds, decay pending, and loop again...
        $midmaxloadfile = 0;
        sleep $TIMEINTERVAL;
        foreach my $n (0 .. $#pending) {
            $pending[$n] *= $STEPDECAY;
        }
    }
}
411
# Add a file to LOGDIR, to record that a process is running.
# The process will remove it when it's done.
#
# Arguments: host name, command number.
# Returns:   the name of the marker file,
#            "$LOGDIR/loadbalance_ps_<host>_<cmdno>".
sub addps {
    my ($host, $cmdno) = @_;
    my $psfile = "$LOGDIR/loadbalance_ps_".$host."_$cmdno";

    # Create the (empty) marker directly instead of running `touch`
    # through a shell: no extra process, and paths containing spaces or
    # shell metacharacters work.  Append mode never truncates a file
    # that already exists.
    if (open(my $marker, '>>', $psfile)) {
        close($marker);
    } else {
        print STDERR "WARNING!  Could not create $psfile: $!\n";
    }
    return $psfile;
}
424
# Count the marker files in LOGDIR for a particular host, to know
# how many processes we are running there....
#
# Argument: host name.
# Returns:  the number of files named loadbalance_ps_<host>_<digits>,
#           as a plain integer (0 when LOGDIR cannot be read).
sub countps {
    my ($host) = @_;

    opendir(my $logdir_handle, $LOGDIR) or return 0;
    # Match the complete file name: a plain substring test would let
    # host "node1" also count the jobs of "node10".
    my $count = grep { /^loadbalance_ps_\Q$host\E_\d+\z/ }
                readdir($logdir_handle);
    closedir($logdir_handle);
    return $count;
}
435
# Count the marker files in LOGDIR for all hosts, to know the total
# number of running processes...
#
# Returns: the number of directory entries whose name contains
#          "loadbalance_ps_", as a plain integer with no padding or
#          newline (0 when LOGDIR cannot be read).
sub countallps {
    opendir(my $logdir_handle, $LOGDIR) or return 0;
    my $count = grep { /loadbalance_ps_/ } readdir($logdir_handle);
    closedir($logdir_handle);
    return $count;
}
445
446
# Get the uptime (load average) on a remote system
#
# Argument: host name.
# Returns:  the first load figure reported by `ssh <host> uptime`, as a
#           number; 99999999 (with a warning on STDERR) when the
#           command produced no usable figure.
sub uptime {
    my ($host) = @_;

    # Run uptime to get the system load
    # Port to Linux - uses ssh <machine> uptime since rup is not part of
    # the default Linux install.
    # List-form pipe open: the host name is passed to ssh as one
    # argument and never interpreted by a shell.
    my $loadstr = "";
    if (open(my $ssh_out, '-|', 'ssh', $host, 'uptime')) {
        local $/;                       # read everything at once
        my $output = <$ssh_out>;
        $loadstr = $output if defined $output;
        close($ssh_out);
    }

    # Find the word after "average:" (some systems print "averages:").
    my @words = split(' ', $loadstr);
    my $upt = "";
    foreach my $i (0 .. $#words - 1) {
        if ($words[$i] =~ /^averages?:\z/) {
            $upt = $words[$i+1];
            last;
        }
    }

    # Remove only a trailing separator comma; an unconditional chop
    # would eat a digit on systems that separate the figures by spaces.
    $upt =~ s/,\z//;
    # Accept a decimal comma as well ("0,42").
    $upt =~ tr/,/./;

    if ($upt !~ /^\d+(?:\.\d+)?\z/) {
        print STDERR "WARNING!  ssh $host uptime failed!\n";
        return(99999999);
    }
    return($upt + 0);
}
474
475
Note: See TracBrowser for help on using the repository browser.