[Beowulf] jobs won't start with PBS scheduler

Tomas Pevny tomas.pevny at binghamton.edu
Thu Feb 2 05:54:09 PST 2006


Hi,
I have a peculiar problem with the PBS scheduler. I have several queues on the
server -- high, MatlabQueue and medium.
MatlabQueue has a limit on the number of running jobs set to 50, since I have
only 50 Matlab licenses. The other two queues have the limit set to 80, which
is the number of processors / cores on the cluster.
My problem is that jobs from queues other than MatlabQueue do not start.
Jobs from the MatlabQueue are scheduled to start immediately when other jobs
finish.
So there are 30 free processors, yet the server does not start any jobs from
the other queues. When I run
qstat -f on one of the queued jobs, it shows the following
(the important line is probably this one:
 comment = Not Running: Draining system to allow starving job to run
)

Job Id: 1750.master.bw01.binghamton.edu
    Job_Name = gapsOutguess-91-100-1
    Job_Owner = tomik at master.bw01.binghamton.edu
    job_state = Q
    queue = medium
    server = master.bw01.binghamton.edu
    Checkpoint = u
    ctime = Tue Jan 31 13:06:45 2006
    Error_Path = 
master.bw01.binghamton.edu:/home/tomik/submitScripts/gapsOutgu
        ess-91-100-1.e1750
    Hold_Types = n
    Join_Path = oe
    Keep_Files = n
    Mail_Points = a
    mtime = Tue Jan 31 13:06:45 2006
    Output_Path = 
master.bw01.binghamton.edu:/home/tomik/submitScripts/output/g
        apsOutguess-91-100-1.out
    Priority = 0
    qtime = Tue Jan 31 13:06:45 2006
    Rerunable = False
    Resource_List.neednodes = 1:ppn=1
    Resource_List.nodect = 1
    Resource_List.nodes = 1:ppn=1
    substate = 10
    Variable_List = PBS_O_HOME=/home/tomik,PBS_O_LANG=en_US.UTF-8,
        PBS_O_LOGNAME=tomik,
        PBS_O_PATH=/opt/maui/bin:/opt/torque/bin:/opt/bin:/opt/hdfview/bin:/op       
t/hdf/bin:/opt/ncarg/bin:/opt/mpich/p4-pathscale/bin:/opt/mpiexec/x86_6
       4/bin:/usr/kerberos/bin:/opt/java/jdk1.5.0/bin:/opt/gm/sbin:/opt/gm/bin
        :/usr/lib64/ccache/bin:/usr/local/bin:/bin:/usr/bin:/usr/X11R6/bin:/opt
        /java/jdk1.5.0/jre/bin:/opt/pathscale/bin:/home/tomik/bin,
        PBS_O_MAIL=/var/spool/mail/tomik,PBS_O_SHELL=/bin/bash,
        PBS_O_HOST=master.bw01.binghamton.edu,
      PBS_O_WORKDIR=/home/tomik/submitScripts,QF=91,PERCENT=100,START_INDEX=1,
        STEP=1,PBS_O_QUEUE=medium
    euser = tomik
    egroup = users
    queue_rank = 1657
    queue_type = E
    comment = Not Running: Draining system to allow starving job to run
    etime = Tue Jan 31 13:06:45 2006

I have also run pbsnodes -a and I see that all nodes are in the free
state, so they are supposed to accept new jobs.
I do not know what to do; I have searched the internet and did not find
anything useful.
The only thing the company that built the cluster recommends is to restart
the PBS scheduler. Since this is not the first time this has happened, and the
situation recurred immediately after I rebooted the master node, I would like
to find the real cause and not just some temporary solution.

This is how the PBS scheduler is set up:
qmgr -c "print server"
# Create queues and set their attributes.
#
#
# Create and define queue MatlabQueue
#
create queue MatlabQueue
set queue MatlabQueue queue_type = Execution
set queue MatlabQueue Priority = 75
set queue MatlabQueue max_running = 50
set queue MatlabQueue max_user_run = 200
set queue MatlabQueue enabled = True
set queue MatlabQueue started = True
#
# Create and define queue default
#
create queue default
set queue default queue_type = Execution
set queue default Priority = 75
set queue default max_running = 100
set queue default max_user_run = 200
set queue default enabled = True
set queue default started = True
#
# Create and define queue high
#
create queue high
set queue high queue_type = Execution
set queue high Priority = 100
set queue high max_running = 100
set queue high max_user_run = 80
set queue high enabled = True
set queue high started = True
#
# Create and define queue medium
#
create queue medium
set queue medium queue_type = Execution
set queue medium Priority = 75
set queue medium max_running = 100
set queue medium max_user_run = 200
set queue medium enabled = True
set queue medium started = True
#
# Set server attributes.
#
set server scheduling = True
set server max_running = 80
set server max_user_run = 80
set server acl_host_enable = True
set server acl_hosts = master.bw01.binghamton.edu
set server acl_hosts += *.bw01.binghamton.edu
set server acl_hosts += localhost.localdomain
set server managers = mpiadmin at master.bw01.binghamton.edu
set server managers += mpiadmin at localhost.localdomain
set server managers += root at master.bw01.binghamton.edu
set server managers += root at localhost.localdomain
set server default_queue = MatlabQueue
set server log_events = 127
set server mail_from = pbsadmin
set server query_other_jobs = True
set server resources_default.neednodes = 1
set server resources_default.nodect = 1
set server resources_default.nodes = 1
set server scheduler_iteration = 150
set server node_ping_rate = 150
set server node_check_rate = 300
set server tcp_timeout = 6
set server comment = Torque Server @ master.bw01.binghamton.edu
set server node_pack = False
set server job_stat_rate = 30

Thanks for any help and suggestions.
Tomas Pevny



More information about the Beowulf mailing list