#!/usr/bin/perl
#
# loadbalance:
#     This script tries to limit the load on a computer.
#     It is applicable to large batches of small jobs.
#     It runs only a few jobs at a time, with low priority, and
#     watches the uptime to make sure it stays below the load limit.
#
# Usage: loadbalance <maxloadfile> <commandfile> [options]
# Options:
#     -logdir logdir   saves the cmd,stdout,stderr files
#                      to another logdir (default is
#                      /n/<hostname>/tmp/loadbalance_PID)
#     -xload           Do pop up xload windows
#     -noxload         Don't pop up xload windows (default)
#
# Where: <maxloadfile> is a file that lists the maximum load for
#     each machine.  It should list each machine on a separate
#     line, e.g.:
#
#         radiance 6
#         lambert 6
#         wavelet 2
#         snell 1
#
#     An optional third word on a line gives the minimum number of
#     jobs (not load) to keep running on that machine.
#
#     You can change this file while it's running, and it will
#     try to match the new numbers.  It will not run any processes
#     on machines unlisted here.
#
#     <commandfile> is a list of commands to be executed, one
#     command per line.  They should not depend on being executed
#     within the initial directory; e.g.:
#
#         cd /usr/data/raytrace; myrt -o f0.rgb f0.iv
#         cd /usr/data/raytrace; myrt -o f1.rgb f1.iv
#         cd /usr/data/raytrace; myrt -o f2.rgb f2.iv
#         cd /usr/data/raytrace; myrt -o f3.rgb f3.iv
#         ....
#
#     You can use a single dash ("-") instead of commandfile,
#     and it will read from stdin.
#
#
44 | |
---|
# Print the full usage/help text to STDERR and terminate the script
# with exit status -1.  Takes no arguments and never returns.
sub printUsage {
    # One entry per output line; a newline is appended when printing.
    my @usage = (
        "Usage: loadbalance <maxloadfile> <commandfile> [options]",
        "Options:",
        " -logdir logdir saves the cmd,stdout,stderr files",
        " to another logdir (default is",
        " /tmp/loadbalance_PID",
        " -xload Do pop up xload windows",
        " -noxload Don't pop up xload windows (default)",
        "",
        "Where: <maxloadfile> is a file that lists the maximum load for",
        " each machine. An optional 3rd word lists the minimum",
        " number of jobs (not load!) for each machine.",
        " maxloadfile should list each machine on a separate line, e.g:",
        "",
        " radiance 6 3",
        " lambert 6 2",
        " wavelet 2",
        " snell 1",
        " aegean 0",
        " ",
        " You can change this file while it's running, and it will",
        " try to match the new numbers. However, you can only change",
        " the numbers, NOT THE MACHINES. Please keep the same machines",
        " in the same order, to aid bookkeeping.",
        "",
        " <commandfile> Is a list of commands to be executed, one",
        " command per line. They should not depend on being executed",
        " within the initial directory; e.g:",
        "",
        " cd /usr/data/raytrace; myrt -o f0.rgb f0.iv",
        " cd /usr/data/raytrace; myrt -o f1.rgb f1.iv",
        " cd /usr/data/raytrace; myrt -o f2.rgb f2.iv",
        " cd /usr/data/raytrace; myrt -o f3.rgb f3.iv",
        " ....",
        "",
        " You can use a single dash (\"-\") instead of commandfile,",
        " and it will read from stdin.",
    );

    foreach my $text (@usage) {
        print STDERR $text . "\n";
    }
    exit(-1);
}
84 | |
---|
# Timestep, in seconds.  This is how long findHost sleeps between
# passes over the host list when no host can take another process.
$TIMEINTERVAL = 6.0;
# Factor each host's PENDING count is multiplied by on every time step.
# This decay compensates for uptime's 1-min lag.
$STEPDECAY = 0.8;

# Option defaults
$DOXLOAD = 0;

# Default directory for log files: <automount prefix><this host>/tmp/...
# so that the same path can be used from the remote hosts.
$AUTOMOUNTPREFIX = "/n/";
$BASEHOST = `hostname`;
# chomp, not chop: only strip a trailing newline, never a real character.
chomp $BASEHOST;
$LOGDIR = $AUTOMOUNTPREFIX.$BASEHOST."/tmp/loadbalance_$$";

# Make stdout autoflush, so the "Uptimes: ...\r" status line written by
# findHost shows up immediately.
use IO::Handle;
STDOUT->autoflush(1);
104 | |
---|
# Open command and maxload files.
#
# Validates the two positional arguments and sets up these globals for
# the rest of the script:
#   $maxloadfile             path of the maxload file ($ARGV[0])
#   @host, @limit            first two words of each maxload line
#   @load, @oldload          initial load reading for each host
#   @pending, @guessload     per-host bookkeeping, initialised to 0
#   $commandfile, COMMAND    the command list ($ARGV[1]) and its handle
#   $ncommands               number of commands, or "???" if unknown
#
# Every usage error is reported through printUsage, which exits.
if ($#ARGV < 1) {
    print STDERR "Wrong number of arguments...\n";
    printUsage();
}

# Figure out and open maxload file...
$maxloadfile = $ARGV[0];
if (substr($maxloadfile, 0, 1) eq "-") {
    print STDERR "Unknown flag: $ARGV[0]...\n";
    printUsage();
}
# Three-argument open: the file name is never parsed for a mode.
if (!open(MAXLOAD, '<', $maxloadfile)) {
    print STDERR "Unable to open maxload file $maxloadfile...\n";
    printUsage();
}

# Initialize some arrays for tracking load, and then
# close maxload for now... findHost reopens it every time it wants
# to check it, so that it can be updated on-the-fly
print STDERR "Host check: Making sure all listed hosts are reachable,\n";
print STDERR " and have proper ssh permissions...\n";
$n = 0;
while (defined($line = <MAXLOAD>)) {
    ($host[$n], $limit[$n]) = split(' ', $line);

    # First just run a simple ssh, to verify host is usable.
    if ($limit[$n] >= 1) {
        $errstat = `ssh $host[$n] date`;
        if ($?) {
            die "Error: `ssh $host[$n] date` failed. Fix ssh settings,\n".
                "or remove host $host[$n] from loadlimit file: $maxloadfile.\n";
        }
    } else {
        print STDERR "$host[$n]: Skipping ssh check, limit less than 1.\n";
    }

    # Get initial uptime info.
    $load[$n] = &uptime($host[$n]);
    $oldload[$n] = $load[$n];
    $pending[$n] = 0;
    print STDERR "Using host $host[$n], load $load[$n], limit $limit[$n].\n";
    $guessload[$n] = 0;

    $n++;
}
close(MAXLOAD);

# Figure out and open command file...
$commandfile = $ARGV[1];
if ($commandfile eq '-') {
    # Read the commands from stdin; their number is not known up front.
    open(COMMAND, "<&STDIN")
        || die "Error: cannot read commands from stdin: $!\n";
    $ncommands = "???";
} elsif (substr($commandfile, 0, 1) eq "-") {
    print STDERR "Unknown flag: $ARGV[1]...\n";
    printUsage();
} else {
    if (!open(COMMAND, '<', $commandfile)) {
        print STDERR "Unable to open command file $commandfile...\n";
        printUsage();
    }
    # Count the commands for the "N of M" progress message.  This is done
    # in Perl rather than with `wc -l`, so the file name never passes
    # through a shell.  Only a regular file can be rewound after counting.
    $ncommands = "???";
    if (-f COMMAND) {
        $ncommands = 0;
        while (defined(my $counted = <COMMAND>)) {
            $ncommands++;
        }
        seek(COMMAND, 0, 0)
            || die "Error: cannot rewind command file $commandfile: $!\n";
    }
}
172 | |
---|
# Parse options: everything after the two positional arguments.
#   -xload / -noxload   switch $DOXLOAD on / off
#   -logdir DIR         replace $LOGDIR; DIR is made absolute and is
#                       prefixed with $AUTOMOUNTPREFIX.$BASEHOST unless it
#                       already starts with $AUTOMOUNTPREFIX
# Anything else is reported and ends the script through printUsage.
$currarg = 2;
while ($currarg <= $#ARGV) {
    my $option = $ARGV[$currarg];

    if ($option eq "-xload") {
        $DOXLOAD = 1;
        $currarg++;
    } elsif ($option eq "-noxload") {
        $DOXLOAD = 0;
        $currarg++;
    } elsif ($option ne "-logdir") {
        print STDERR "Error: Unhandled arg $ARGV[$currarg].\n\n";
        &printUsage();
    } else {
        # Handle -logdir: consume the flag and its value
        $LOGDIR = $ARGV[$currarg+1];
        $currarg += 2;

        # check arg existed
        if ($LOGDIR eq "") {
            print STDERR "Error: no logdir???\n\n";
            &printUsage();
        }

        # add absolute path to logdir
        if ($LOGDIR !~ m{^/}) {
            $PWD = `pwd`;
            chop $PWD;
            $LOGDIR = "$PWD/$LOGDIR";
        }

        # Add /n/basehost, if necessary
        if (index($LOGDIR, $AUTOMOUNTPREFIX) != 0) {
            $LOGDIR = $AUTOMOUNTPREFIX.$BASEHOST.$LOGDIR;
        }
    }
}
207 | |
---|
# Make sure logdir is usable.
#
# If $LOGDIR already exists it must be a directory we can enter and write
# to; any files left there by an earlier run (names starting with
# "loadbalance_") are removed, so that stale "loadbalance_ps_*" markers are
# not counted as running processes.  If it does not exist it is created.
#
# The cleanup is done with opendir/unlink rather than a shell pipeline:
# the old egrep pattern began with "$" and so never matched anything, and
# "cd DIR; ... rm" would have run in the current directory if the cd failed.
if (-e $LOGDIR) {
    if (!-d $LOGDIR) {
        print STDERR "Error: loadbalance: logdir $LOGDIR exists, \n".
            "and is not a directory.\n";
        printUsage();
    }
    -x $LOGDIR || die "Error: logdir $LOGDIR does not have execute permissions\n";
    -w $LOGDIR || die "Error: logdir $LOGDIR does not have write permissions\n";

    # Clear any old loadbalance logfiles
    print STDERR "Clearing old log files....";
    opendir(OLDLOGS, $LOGDIR)
        || die "Error: cannot read logdir $LOGDIR: $!\n";
    my @stale = grep { /^loadbalance_/ } readdir(OLDLOGS);
    closedir(OLDLOGS);
    foreach my $name (@stale) {
        unlink("$LOGDIR/$name")
            || print STDERR "\nWarning: could not remove $LOGDIR/$name: $!\n";
    }
    print STDERR "Done.\n";
} else {
    # make logdir (parent directories are not created)
    mkdir($LOGDIR, 0777) || die "Error: Could not mkdir $LOGDIR\n";
}
234 | |
---|
# Optionally pop up one xload window per usable host.
# Only hosts with a limit of at least 1 get a window; the graph scale and
# the window height are derived from that host's limit.
if ($DOXLOAD) {
    print STDERR "Starting xloads...";
    for ($n = 0; $n <= $#host; $n++) {
        # hosts with a limit below 1 get no window
        next unless ($limit[$n] >= 1);

        $scale = 1 + int($limit[$n]);
        $geomscale = 20 * ($scale + 1);
        # The trailing "&" puts the ssh in the background, so the script
        # does not wait for the window to be closed.
        $cmd = "ssh $host[$n] xload -fg green -hl blue -bg black ".
            "-scale $scale -geom 300x$geomscale &\n";
        system($cmd);
    }
    print STDERR "Done!\n";
}
252 | |
---|
253 | |
---|
254 | ###################################################################### |
---|
255 | ########## |
---|
256 | ########## Main Loop |
---|
257 | ########## |
---|
258 | ###################################################################### |
---|
259 | |
---|
260 | |
---|
# Loop until commands are exhausted.
#
# For every line of the command file:
#   1. wait for findHost to name a host that can take another process;
#   2. create the "ps" marker file for the job (addps);
#   3. write the command into a small script in $LOGDIR, followed by a
#      line that removes the marker again;
#   4. start that script on the host through ssh, in the background, with
#      this script's stdout/stderr temporarily pointed at per-job log files.
for ($cmdno = 1; defined($commandline = <COMMAND>); $cmdno++) {
    # chomp, not chop: a last line without a trailing newline must not
    # lose its final character.
    chomp($commandline);

    # Find a host for ssh. Wait, if necessary...
    $host = &findHost();
    $psfile = &addps($host, $cmdno);

    # Names of the sibling files for this job.  The substitution is limited
    # to the file-name part, so a "_ps_" inside the directory path of
    # $LOGDIR is left alone.
    ($stdoutname = $psfile) =~ s{_ps_(?=[^/]*\z)}{_stdout_};
    ($stderrname = $psfile) =~ s{_ps_(?=[^/]*\z)}{_stderr_};
    ($cmdname    = $psfile) =~ s{_ps_(?=[^/]*\z)}{_cmd_};

    # Set up the command script: each ";"-separated part of the command
    # goes on its own line, and the last line removes the marker file.
    # NOTE(review): the script is run with "csh -ef"; if -e makes csh stop
    # at the first failing command, the marker is never removed and the
    # final wait never ends -- confirm and handle if so.
    if (!open(CMD, '>', $cmdname)) {
        print STDERR "Error: cannot write $cmdname: $!; ".
            "skipping command $cmdno\n";
        # Do not leave a marker behind for a job that was never started.
        unlink($psfile);
        next;
    }
    ($cmdlines = $commandline) =~ s/;/\n/g;
    print CMD "$cmdlines\n";
    print CMD "/bin/rm $psfile\n";
    close(CMD);

    # The command must be run in the background, or else this script
    # would never run more than 1 at a time.  It is started under
    # /bin/nice by the line below.
    $cmd = "ssh $host /bin/nice -20 csh -ef $cmdname &\n";
    print STDERR $cmd;

    # Redirect STDOUT, STDERR to log files, keeping the real ones in
    # SAVEOUT/SAVEERR so they can be restored afterwards.
    open(SAVEOUT, ">&STDOUT");
    open(SAVEERR, ">&STDERR");
    open(STDOUT, '>', $stdoutname)
        || print SAVEERR "Warning: cannot write $stdoutname: $!\n";
    open(STDERR, '>', $stderrname)
        || print SAVEERR "Warning: cannot write $stderrname: $!\n";
    select(STDERR); $| = 1;
    select(STDOUT); $| = 1;

    # Actually run the ssh command
    system($cmd);

    # restore STDOUT, STDERR
    open(STDOUT, ">&SAVEOUT");
    open(STDERR, ">&SAVEERR");
    close(SAVEOUT);
    close(SAVEERR);
}
311 | |
---|
# Once all the commands are started, wait for them all to finish.
# countallps is polled every 4 seconds and the remaining count is printed
# whenever it changes.  The count is passed through int() before it is
# used, so whitespace or a newline in the value returned by countallps
# never ends up in the messages.
$nleft = int(&countallps());
print STDERR "loadbalance waiting for $nleft processes to finish...\n";
while ($nleft > 0) {
    $oleft = $nleft;
    $nleft = int(&countallps());

    if ($nleft != $oleft) {
        print STDERR "$nleft...";
    }
    # No point in sleeping once nothing is left.
    sleep 4 if ($nleft > 0);
}
print STDERR " Done!\n";
327 | |
---|
328 | |
---|
329 | |
---|
330 | |
---|
331 | |
---|
332 | ###################################################################### |
---|
333 | ########## |
---|
334 | ########## Helper functions |
---|
335 | ########## |
---|
336 | ###################################################################### |
---|
337 | |
---|
338 | |
---|
# Find a host for ssh.  Wait, if necessary.
#
# Takes no arguments.  Returns the name of a host that may take one more
# process, and adds 1.1 to that host's entry in @pending.
#
# The maxload file is re-read on the first call and again after every
# sleep, so limits can be edited while the script runs.  A host is chosen
# when either
#   - its estimated load plus pending jobs plus one, and its number of
#     running jobs plus one, both stay within its limit, or
#   - it is running fewer jobs than its minimum (third word of the line).
# Hosts with limit 0 are never used.  If no host qualifies, a status line
# is printed, the function sleeps $TIMEINTERVAL seconds, decays @pending by
# $STEPDECAY and tries again.
#
# Reads/writes the globals @host, @limit, @minlimit, @load, @oldload,
# @pending, $midmaxloadfile and $icommand.
sub findHost {
    while (1) {
        # Make sure we have reloaded maxload file, if necessary.
        if (!$midmaxloadfile) {
            if (open(MAXLOAD, '<', $maxloadfile)) {
                my $row = 0;
                while (defined(my $line = <MAXLOAD>)) {
                    my @words = split(' ', $line);
                    $host[$row]     = $words[0];
                    $limit[$row]    = $words[1];
                    $minlimit[$row] = $words[2];
                    $row++;
                }
                close(MAXLOAD);
            } else {
                # Keep the previous limits if the file is momentarily
                # unreadable (e.g. while it is being rewritten).
                print STDERR "WARNING! Unable to reread maxload file ".
                    "$maxloadfile: $!\n";
            }
            $midmaxloadfile = 1;
        }

        # Run through the list, checking uptime...
        for (my $n = 0; $n <= $#host; $n++) {
            # Skip machines with limit 0
            next if ($limit[$n] == 0);

            # Compute guessload, our guess of what the load "should" be:
            # the measured load, plus (if it is rising) the rise since the
            # last reading scaled up to a full minute.
            $load[$n] = &uptime($host[$n]);
            my $guessload = $load[$n];
            if ($load[$n] > $oldload[$n]) {
                $guessload +=
                    (60.0 / $TIMEINTERVAL) * ($load[$n] - $oldload[$n]);
            }
            # Remember the measured load (not the guess) for next time.
            $oldload[$n] = $load[$n];

            # Count number of processes still running on this host
            my $pscount = 0 + &countps($host[$n]);

            # Reduce pending to pscount, if it's too big, since the
            # pending load (from our jobs) cannot be larger than the
            # total number of (our) jobs.
            $pending[$n] = $pscount if ($pending[$n] > $pscount);

            if (($guessload + $pending[$n] + 1 <= $limit[$n] &&
                 $pscount + 1 <= $limit[$n]) ||
                $minlimit[$n] > $pscount) {
                # We found a processor to use....
                print STDERR "========\n";
                $icommand++;
                print STDERR "Using $host[$n] ($icommand of $ncommands): load: $load[$n], guess: $guessload,".
                    " pending: $pending[$n], limit: $limit[$n], ".
                    "pscount: $pscount ...\n";
                # add 1.1, to be cautious...
                $pending[$n] += 1.1;
                return($host[$n]);
            }
        }

        # Print waiting message so people know it's still alive...
        print STDOUT "Uptimes: ";
        for (my $n = 0; $n <= $#host; $n++) {
            print STDOUT $host[$n]." ".$load[$n].", ";
        }
        print STDOUT "\r";

        # If we get here, we ran through the whole list.
        # Wait $TIMEINTERVAL seconds, decay pending, and loop again...
        $midmaxloadfile = 0;
        sleep $TIMEINTERVAL;
        for (my $n = 0; $n <= $#pending; $n++) {
            $pending[$n] *= $STEPDECAY;
        }
    }
}
411 | |
---|
# Add a file to LOGDIR, to record that a process is running.
# The process will remove it when it's done.
#
# Arguments: the host name and the command number.
# Returns the name of the file: $LOGDIR/loadbalance_ps_<host>_<cmdno>.
# The name is returned even if the file could not be created; in that
# case a warning is printed to STDERR.
sub addps {
    # Lexicals, so the caller's $host/$cmdno/$psfile are left untouched.
    my ($host, $cmdno) = @_;
    my $psfile = "$LOGDIR/loadbalance_ps_".$host."_$cmdno";

    # Append mode creates the file without truncating an existing one.
    if (open(my $marker, '>>', $psfile)) {
        close($marker);
    } else {
        print STDERR "WARNING! Could not create $psfile: $!\n";
    }
    return $psfile;
}
424 | |
---|
# Count the marker files in LOGDIR for a particular host, to know
# how many processes we are running there....
#
# Argument: the host name.
# Returns the number of files named loadbalance_ps_<host>_<digits> in
# $LOGDIR, as a plain integer; 0 if $LOGDIR cannot be read.
# The whole name is matched, so host "snell" does not also count the
# markers of a host called "snell2".
sub countps {
    my ($host) = @_;

    opendir(my $dir, $LOGDIR) or return 0;
    my $pscount = grep { /^loadbalance_ps_\Q$host\E_\d+\z/ } readdir($dir);
    closedir($dir);
    return $pscount;
}
435 | |
---|
# Count the marker files in LOGDIR for all hosts, to know the total number
# of running processes...
#
# Takes no arguments.  Returns the number of files in $LOGDIR whose name
# contains "loadbalance_ps_", as a plain integer; 0 if $LOGDIR cannot be
# read.
sub countallps {
    opendir(my $dir, $LOGDIR) or return 0;
    my $pscount = grep { /loadbalance_ps_/ } readdir($dir);
    closedir($dir);
    return $pscount;
}
445 | |
---|
446 | |
---|
# Get the uptime (load average) on a remote system.
#
# Argument: the host name.
# Runs "ssh <host> uptime" and returns, as a number, the first figure
# after "load average:" (or "load averages:").  The figure may or may not
# be followed by a comma, and a decimal comma is accepted.
# If no such figure is found, a warning is printed to STDERR and
# 99999999 is returned, so the host looks fully loaded.
sub uptime {
    # Lexicals, so the caller's $host and @words are left untouched.
    my ($host) = @_;

    # ssh <machine> uptime is used instead of rup, since rup is not
    # part of the default Linux install.
    my $loadstr = `ssh $host uptime`;
    $loadstr = "" unless defined $loadstr;

    if ($loadstr =~ /load\s+averages?:\s*([0-9]+(?:[.,][0-9]+)?)/) {
        my $upt = $1;
        $upt =~ tr/,/./;    # decimal comma -> decimal point
        return($upt + 0);
    }

    print STDERR "WARNING! ssh $host uptime failed!\n";
    return(99999999);
}
474 | |
---|
475 | |
---|