[Ace-users] [ace-bugs] ACE_Process_Manager: race condition for short lived processes
Russell Mora
russell_mora at symantec.com
Fri Jan 25 15:56:37 CST 2008
ACE VERSION: 5.4.8
(Yes, I know it is ancient, but I checked SVN and the problem appears to
be still there)
HOST MACHINE and OPERATING SYSTEM:
If on Windows based OS's, which version of WINSOCK do you
use?:
All non-windows - bug originally seen on Solaris but reproduced and
tested on RHEL4 x86
TARGET MACHINE and OPERATING SYSTEM, if different from HOST:
COMPILER NAME AND VERSION (AND PATCHLEVEL):
THE $ACE_ROOT/ace/config.h FILE [if you use a link to a platform-
specific file, simply state which one]:
Not relevant
THE $ACE_ROOT/include/makeinclude/platform_macros.GNU FILE [if you
use a link to a platform-specific file, simply state which one
(unless this isn't used in this case, e.g., with Microsoft Visual
C++)]:
Various depending on OS
CONTENTS OF $ACE_ROOT/bin/MakeProjectCreator/config/default.features
(used by MPC when you generate your own makefiles):
AREA/CLASS/EXAMPLE AFFECTED:
[What example failed? What module failed to compile?]
ACE_Process_Manager
DOES THE PROBLEM AFFECT:
EXECUTION?
Race condition - see below
SYNOPSIS:
ACE_Process_Manager::spawn contains race condition when registered with
reactor
DESCRIPTION:
We had seen "oops, reaped unmanaged xxx" messages for some short lived
processes which prevented a registered exit handler from being called.
Upon investigation of the ACE_Process_Manager::spawn() method we noticed
the potential for a race condition:
pid_t pid = process->spawn (options);
// Only include the pid in the parent's table.
if (pid == ACE_INVALID_PID || pid == 0)
return pid;
ACE_MT (ACE_GUARD_RETURN (ACE_Recursive_Thread_Mutex,
ace_mon, this->lock_, -1));
if (this->append_proc (process) == -1)
// bad news: spawned, but not registered in table.
return ACE_INVALID_PID;
The problem here is the gap between when the process is spawn and when
it is added to the ACE_Process_Manager's process table. When the
ACE_Process_Manager is registered with a (running) reactor and a short
lived process is spawned, it is possible that the child process will
exit and the reactor (upon receipt of the SIGCHLD) will call
ACE_Process_Manager::wait() before this code has had a chance to add the
process information to the process table.
(This of course is not a problem on windows because on windows the
process handles are each individually registered with the reactor in the
append_proc() method).
REPEAT BY:
I modified the ACE_wrappers/tests/Process_Manager_Test.cpp to reproduce
this problem - here is the diff:
Index: tests/Process_Manager_Test.cpp
===================================================================
RCS file:
/local/cvs/master/pspACETAO/Current/source/ACE_wrappers/tests/Process_Ma
nager_Test.cpp,v
retrieving revision 1.1.1.1
diff -u -w -r1.1.1.1 Process_Manager_Test.cpp
--- tests/Process_Manager_Test.cpp 6 Dec 2005 23:30:45 -0000
1.1.1.1
+++ tests/Process_Manager_Test.cpp 25 Jan 2008 21:45:36 -0000
@@ -32,6 +32,10 @@
#include "ace/Thread.h"
#include "ace/Reactor.h"
+#if defined (ACE_MT_SAFE) && (ACE_MT_SAFE != 0) && !defined (ACE_WIN32)
+#include "ace/Task.h"
+#endif /* ACE_MT_SAFE */
+
ACE_RCSID(tests, Process_Manager_Test, "Process_Manager_Test.cpp,v 4.11
1999/09/02 04:36:30 schmidt Exp")
static u_int debug_test = 0;
@@ -74,7 +78,8 @@
static pid_t
spawn_child (const ACE_TCHAR *argv0,
ACE_Process_Manager &mgr,
- int sleep_time = 0)
+ int sleep_time = 0,
+ ACE_Process * process = 0)
{
#if defined (ACE_WIN32)
const ACE_TCHAR *cmdline_format = ACE_TEXT("\"%s\" %s %d");
@@ -90,7 +95,11 @@
debug_test ? ACE_TEXT ("-d") : ACE_TEXT (""),
sleep_time);
- pid_t result = mgr.spawn (opts);
+ pid_t result = -1;
+ if (process)
+ result = mgr.spawn (process, opts);
+ else
+ result = mgr.spawn (opts);
if (result != ACE_INVALID_PID)
ACE_DEBUG ((LM_DEBUG,
@@ -291,18 +300,86 @@
mgr.register_handler (new Exit_Handler ("specific"),
child6);
+
+#if defined (ACE_MT_SAFE) && (ACE_MT_SAFE != 0) && !defined(ACE_WIN32)
+ // Run the reactor event loop in another thread
+ class Reactor_Worker: public ACE_Task_Base
+ {
+ public:
+ virtual int svc (void)
+ {
+ ACE_Time_Value how_long (10);
+ ACE_Reactor::instance ()->owner (ACE_OS::thr_self ());
+ return ACE_Reactor::instance ()->run_reactor_event_loop
(how_long);
+ }
+ } reactor_worker;
+ if (reactor_worker.activate () != 0)
+ {
+ ACE_ERROR ((LM_ERROR,
+ ACE_TEXT ("(%P) Failed to spawn reactor thread\n")));
+ test_status = 1;
+ }
+
+ // ACE_Process::parent is called just after the child process is
+ // spawned but before the child process is added to the
+ // ACE_Process_Manager's list of managed child processes - if we
+ // sleep here we give the reactor an opportunity to cleanup the
+ // child process before it is registered.
+ class Delay_Process : public ACE_Process
+ {
+ public:
+ Delay_Process ()
+ : unmanaged_ (false)
+ {}
+ virtual void parent (pid_t)
+ {
+ // Give the reactor a chance to catch the child process exit
+ if (ACE_OS::sleep (2) != 0 && errno == EINTR)
+ {
+ // child process has exited - give the reactor a little more
time
+ ACE_OS::sleep (2);
+ }
+ }
+ virtual void unmanage (void)
+ {
+ this->unmanaged_ = true;
+ }
+ bool unmanaged_;
+ } delay_process;
+ spawn_child (argv[0],
+ mgr,
+ 0,
+ &delay_process);
+
+ reactor_worker.wait ();
+
+#else /* ACE_MT_SAFE */
ACE_Time_Value how_long (10);
ACE_Reactor::instance ()->run_reactor_event_loop (how_long);
+#endif /* ACE_MT_SAFE */
ACE_DEBUG ((LM_DEBUG,
ACE_TEXT ("(%P) Reactor loop done!\n") ));
+#if defined (ACE_MT_SAFE) && (ACE_MT_SAFE != 0) && !defined (ACE_WIN32)
+ if (delay_process.unmanaged_ == false)
+ {
+ ACE_ERROR ((LM_ERROR,
+ ACE_TEXT ("(%P) Child process not correctly
cleaned-up")
+ ACE_TEXT (" - check for 'oops' in log\n")));
+ test_status = 1;
+ }
+#endif /* ACE_MT_SAFE */
+
size_t nr_procs = mgr.managed ();
if (nr_procs != 0)
+ {
ACE_ERROR ((LM_ERROR,
ACE_TEXT ("(%P) %d processes left in manager\n"),
nr_procs));
+ test_status = 1;
+ }
ACE_END_TEST;
return test_status;
SAMPLE FIX/WORKAROUND:
I fixed this by simply moving the acquisition of the lock in
ACE_Process_Manager::spawn() to before ACE_Process::spawn() is called -
this causes the reactor thread to block in ACE_Process_Manager::wait()
until after the process information is entered into the process table.
This to me seems safe from dead-lock due to the fact that this is a
*recursive* thread mutex - please correct me if this is not the case.
Here is the diff:
Index: ace/Process_Manager.cpp
===================================================================
RCS file:
/local/cvs/master/pspACETAO/Current/source/ACE_wrappers/ace/Process_Mana
ger.cpp,v
retrieving revision 1.1.1.1
diff -u -w -r1.1.1.1 Process_Manager.cpp
--- ace/Process_Manager.cpp 6 Dec 2005 23:26:27 -0000 1.1.1.1
+++ ace/Process_Manager.cpp 25 Jan 2008 21:45:36 -0000
@@ -441,14 +441,21 @@
{
ACE_TRACE ("ACE_Process_Manager::spawn");
+#if !defined (ACE_WIN32)
+ ACE_MT (ACE_GUARD_RETURN (ACE_Recursive_Thread_Mutex,
+ ace_mon, this->lock_, -1));
+#endif /* ACE_WIN32 */
+
pid_t pid = process->spawn (options);
// Only include the pid in the parent's table.
if (pid == ACE_INVALID_PID || pid == 0)
return pid;
+#if defined (ACE_WIN32)
ACE_MT (ACE_GUARD_RETURN (ACE_Recursive_Thread_Mutex,
ace_mon, this->lock_, -1));
+#endif /* ACE_WIN32 */
if (this->append_proc (process) == -1)
// bad news: spawned, but not registered in table.
More information about the Ace-users
mailing list