某Team在用Python开发一些代码,涉及子进程以及设法消除僵尸进程的需求。实践中他们碰上Python程序非预期退出的现象。最初他们决定用GDB调试Python解释器,查看exit()的源头。我听了之后,觉得这个问题应该用别的调试思路。帮他们排查这次程序故障时,除去原始问题,还衍生了其他问题。
这次的问题相比西安研发中心曾经碰上的Python信号处理问题,有不少基础知识、先验知识是共用的,此处不做再普及,感兴趣的同学可以翻看我以前发过的文章。
下文是一次具体的调试、分析记录。为了简化现场、方便调试,已将原始问题、衍生问题浓缩成DebugPythonWithGDB_6.py、DebugPythonWithGDB_7.py。
$ vi DebugPythonWithGDB_6.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
#!/usr/bin/env python # -*- encoding: utf-8 -*- ? import sys, os, signal, subprocess, shlex, traceback ? def on_SIGCHLD ( signum, frame ) : ????print "[on_SIGCHLD" ????sys.stdout.write( "signum??= %u\n" % signum ) ????traceback.print_stack( frame ) ????print os.waitpid( -1, os.WNOHANG ) ????""" ????try : ????????print os.waitpid( -1, os.WNOHANG ) ????except OSError : ????????sys.stdout.write( 'Line[%u]: OSError\n' % sys.exc_info()[2].tb_lineno ) ????""" ????print "on_SIGCHLD]" ? def do_more ( count ) : ????print '[do_more() begin %u]' % count ????os.system( r'printf "Child?? = %u\n" " />;/bin/sleep 1' ) """ # # 这里存在竞争条件,可以增加触发OSError异常的概率 # os.system( r'printf "Child = %u\n" ????os.system( r'printf "Child?? = %u\n" " />;/bin/sleep 1' ) os.system( r'printf "Child = %u\n" ????os.system( r'printf "Child?? = %u\n" " />;/bin/sleep 1' ) """ print '[do_more() end %u]' % count def main ( prog, args ) : if 0 == len( args ) : print 'Usage: %s ' % prog else : sys.stdout.write( "Parent = %u\n" % os.getpid() ) # # 本例中,即使有下列代码,Ctrl-C仍然无效。 # signal.signal( signal.SIGINT, signal.SIG_DFL ) # # signal.signal( signal.SIGCHLD, signal.SIG_IGN ) # signal.signal( signal.SIGCHLD, on_SIGCHLD ) # count = 0 while True : # # 本例中父进程只是一个调度框架,不需要与子进程进行通信,因此不 # 需要特别处理"stdin=None, stdout=None, stderr=None"。 # child = subprocess.Popen \ ( # # 不要直接用args[0].split(),它在处理单、双引号时不是我们 # 期望的行为。考虑这种例子,ls -l "/tmp/non exist" # shlex.split( args[0] ), # # all file descriptors except 0, 1 and 2 will be closed # before the child process is executed # close_fds = True, cwd = "/tmp" ) sys.stdout.write( "Child = %u\n" % child.pid ) # # child.send_signal( signal.SIGTERM ) # child.terminate() # child.kill() # # child.wait() # do_more( count ) count += 1 if '__main__' == __name__ : try : main( os.path.basename( sys.argv[0] ), sys.argv[1:] ) except KeyboardInterrupt : pass |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
$ python DebugPythonWithGDB_6.py 'python -c "import time;time.sleep(3600)"' Parent = 10244 Child = 10245 [do_more() begin 0] [on_SIGCHLD signum = 17 File "DebugPythonWithGDB_6.py", line 81, in main( os.path.basename( sys.argv[0] ), sys.argv[1:] ) File "DebugPythonWithGDB_6.py", line 76, in main do_more( count ) File "DebugPythonWithGDB_6.py", line 20, in do_more print '[do_more() begin %u]' % count (10245, 9) on_SIGCHLD] Child = 10246 [on_SIGCHLD signum = 17 File "DebugPythonWithGDB_6.py", line 81, in main( os.path.basename( sys.argv[0] ), sys.argv[1:] ) File "DebugPythonWithGDB_6.py", line 76, in main do_more( count ) File "DebugPythonWithGDB_6.py", line 21, in do_more |