[Ace-users] [ace-bugs] ACE_Process_Manager: race condition for short lived processes

Fri Jan 25 15:56:37 CST 2008

ACE VERSION: 5.4.8
(Yes, I know it is ancient, but I checked SVN and the problem appears to
be still there)

    HOST MACHINE and OPERATING SYSTEM:
        If on Windows based OS's, which version of WINSOCK do you
        use?:
All non-windows - bug originally seen on Solaris but reproduced and
tested on RHEL4 x86

    TARGET MACHINE and OPERATING SYSTEM, if different from HOST:
    COMPILER NAME AND VERSION (AND PATCHLEVEL):

    THE $ACE_ROOT/ace/config.h FILE [if you use a link to a platform-
    specific file, simply state which one]:
Not relevant

    THE $ACE_ROOT/include/makeinclude/platform_macros.GNU FILE [if you
    use a link to a platform-specific file, simply state which one
    (unless this isn't used in this case, e.g., with Microsoft Visual
    C++)]:
Various depending on OS

    CONTENTS OF $ACE_ROOT/bin/MakeProjectCreator/config/default.features
    (used by MPC when you generate your own makefiles):

    AREA/CLASS/EXAMPLE AFFECTED:
[What example failed?  What module failed to compile?]
ACE_Process_Manager

    DOES THE PROBLEM AFFECT:
        EXECUTION?
Race condition - see below

    SYNOPSIS:
ACE_Process_Manager::spawn contains race condition when registered with
reactor

    DESCRIPTION:
We had seen "oops, reaped unmanaged xxx" messages for some short lived
processes which prevented a registered exit handler from being called.
Upon investigation of the ACE_Process_Manager::spawn() method we noticed
the potential for a race condition:

  pid_t pid = process->spawn (options);

  // Only include the pid in the parent's table.
  if (pid == ACE_INVALID_PID || pid == 0)
    return pid;

  ACE_MT (ACE_GUARD_RETURN (ACE_Recursive_Thread_Mutex,
                            ace_mon, this->lock_, -1));

  if (this->append_proc (process) == -1)
    // bad news: spawned, but not registered in table.
    return ACE_INVALID_PID;

The problem here is the gap between when the process is spawn and when
it is added to the ACE_Process_Manager's process table.  When the
ACE_Process_Manager is registered with a (running) reactor and a short
lived process is spawned, it is possible that the child process will
exit and the reactor (upon receipt of the SIGCHLD) will call
ACE_Process_Manager::wait() before this code has had a chance to add the
process information to the process table.

(This of course is not a problem on windows because on windows the
process handles are each individually registered with the reactor in the
append_proc() method).

    REPEAT BY:
I modified the ACE_wrappers/tests/Process_Manager_Test.cpp to reproduce
this problem - here is the diff:

Index: tests/Process_Manager_Test.cpp
===================================================================
RCS file:
/local/cvs/master/pspACETAO/Current/source/ACE_wrappers/tests/Process_Ma
nager_Test.cpp,v
retrieving revision 1.1.1.1
diff -u -w -r1.1.1.1 Process_Manager_Test.cpp

--- tests/Process_Manager_Test.cpp      6 Dec 2005 23:30:45 -0000
1.1.1.1
+++ tests/Process_Manager_Test.cpp      25 Jan 2008 21:45:36 -0000
@@ -32,6 +32,10 @@
 #include "ace/Thread.h"
 #include "ace/Reactor.h"

+#if defined (ACE_MT_SAFE) && (ACE_MT_SAFE != 0) && !defined (ACE_WIN32)
+#include "ace/Task.h"
+#endif /* ACE_MT_SAFE */
+
 ACE_RCSID(tests, Process_Manager_Test, "Process_Manager_Test.cpp,v 4.11
1999/09/02 04:36:30 schmidt Exp")

 static u_int debug_test = 0;
@@ -74,7 +78,8 @@
 static pid_t
 spawn_child (const ACE_TCHAR *argv0,
              ACE_Process_Manager &mgr,
-             int sleep_time = 0)
+             int sleep_time = 0,
+             ACE_Process * process = 0)
 {
 #if defined (ACE_WIN32)
 const ACE_TCHAR *cmdline_format = ACE_TEXT("\"%s\" %s %d");
@@ -90,7 +95,11 @@
                      debug_test ? ACE_TEXT ("-d") : ACE_TEXT (""),
                      sleep_time);

-  pid_t  result = mgr.spawn (opts);
+  pid_t  result = -1;
+  if (process)
+    result = mgr.spawn (process, opts);
+  else
+    result = mgr.spawn (opts);

   if (result != ACE_INVALID_PID)
     ACE_DEBUG ((LM_DEBUG,
@@ -291,18 +300,86 @@
   mgr.register_handler (new Exit_Handler ("specific"),
                         child6);

+
+#if defined (ACE_MT_SAFE) && (ACE_MT_SAFE != 0) && !defined(ACE_WIN32)
+  // Run the reactor event loop in another thread
+  class Reactor_Worker: public ACE_Task_Base
+  {
+  public:
+    virtual int svc (void)
+    {
+      ACE_Time_Value how_long (10);
+      ACE_Reactor::instance ()->owner (ACE_OS::thr_self ());
+      return ACE_Reactor::instance ()->run_reactor_event_loop
(how_long);
+    }
+  } reactor_worker;
+  if (reactor_worker.activate () != 0)
+    {
+      ACE_ERROR ((LM_ERROR,
+                  ACE_TEXT ("(%P) Failed to spawn reactor thread\n")));
+      test_status = 1;
+    }
+
+  // ACE_Process::parent is called just after the child process is
+  // spawned but before the child process is added to the
+  // ACE_Process_Manager's list of managed child processes - if we
+  // sleep here we give the reactor an opportunity to cleanup the
+  // child process before it is registered.
+  class Delay_Process : public ACE_Process
+  {
+  public:
+    Delay_Process ()
+      : unmanaged_ (false)
+    {}
+    virtual void parent (pid_t)
+    {
+      // Give the reactor a chance to catch the child process exit
+      if (ACE_OS::sleep (2) != 0 && errno == EINTR)
+        {
+          // child process has exited - give the reactor a little more
time
+          ACE_OS::sleep (2);
+        }
+    }
+    virtual void unmanage (void)
+    {
+      this->unmanaged_ = true;
+    }
+    bool unmanaged_;
+  } delay_process;
+  spawn_child (argv[0],
+               mgr,
+               0,
+               &delay_process);
+
+  reactor_worker.wait ();
+
+#else  /* ACE_MT_SAFE */
   ACE_Time_Value how_long (10);

   ACE_Reactor::instance ()->run_reactor_event_loop (how_long);
+#endif /* ACE_MT_SAFE */

   ACE_DEBUG ((LM_DEBUG,
               ACE_TEXT ("(%P) Reactor loop done!\n") ));

+#if defined (ACE_MT_SAFE) && (ACE_MT_SAFE != 0) && !defined (ACE_WIN32)
+  if (delay_process.unmanaged_ == false)
+    {
+      ACE_ERROR ((LM_ERROR,
+                  ACE_TEXT ("(%P) Child process not correctly
cleaned-up")
+                  ACE_TEXT (" - check for 'oops' in log\n")));
+      test_status = 1;
+    }
+#endif /* ACE_MT_SAFE */
+
   size_t nr_procs = mgr.managed ();
   if (nr_procs != 0)
+    {
     ACE_ERROR ((LM_ERROR,
                 ACE_TEXT ("(%P) %d processes left in manager\n"),
                 nr_procs));
+      test_status = 1;
+    }

   ACE_END_TEST;
   return test_status;


    SAMPLE FIX/WORKAROUND:
I fixed this by simply moving the acquisition of the lock in
ACE_Process_Manager::spawn() to before ACE_Process::spawn() is called -
this causes the reactor thread to block in ACE_Process_Manager::wait()
until after the process information is entered into the process table.
This to me seems safe from dead-lock due to the fact that this is a
*recursive* thread mutex - please correct me if this is not the case.

Here is the diff:
Index: ace/Process_Manager.cpp
===================================================================
RCS file:
/local/cvs/master/pspACETAO/Current/source/ACE_wrappers/ace/Process_Mana
ger.cpp,v
retrieving revision 1.1.1.1
diff -u -w -r1.1.1.1 Process_Manager.cpp
--- ace/Process_Manager.cpp     6 Dec 2005 23:26:27 -0000       1.1.1.1
+++ ace/Process_Manager.cpp     25 Jan 2008 21:45:36 -0000
@@ -441,14 +441,21 @@
 {
   ACE_TRACE ("ACE_Process_Manager::spawn");

+#if !defined (ACE_WIN32)
+  ACE_MT (ACE_GUARD_RETURN (ACE_Recursive_Thread_Mutex,
+                            ace_mon, this->lock_, -1));
+#endif /* ACE_WIN32 */
+
   pid_t pid = process->spawn (options);

   // Only include the pid in the parent's table.
   if (pid == ACE_INVALID_PID || pid == 0)
     return pid;

+#if defined (ACE_WIN32)
   ACE_MT (ACE_GUARD_RETURN (ACE_Recursive_Thread_Mutex,
                             ace_mon, this->lock_, -1));
+#endif /* ACE_WIN32 */

   if (this->append_proc (process) == -1)
     // bad news: spawned, but not registered in table.