Hi,

We migrated the perdition machine to a VM and, at the same time, upgraded the package from perdition-1.17
to perdition-1.18.

From time to time, the new perdition dies at random, a few times a day.  I finally had time
to strace perdition.pop3 as it died to see what is happening; here is the output:

accept(4, {sa_family=AF_INET, sin_port=htons(53846), sin_addr=inet_addr("<IP Address so show>")}, [6256062840960974864]) = 5
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x2b1169929a80) = 21625
close(5)                                = 0
fcntl(4, F_SETFL, O_RDWR)               = 0
poll([{fd=4, events=POLLIN}], 1, -1)    = -1 EINTR (Interrupted system call)
--- SIGCHLD (Child exited) @ 0 (0) ---
rt_sigaction(SIGCHLD, {0x405260, [CHLD], SA_RESTORER|SA_RESTART, 0x2b1167c4f2d0}, {0x405260, [CHLD], SA_RESTORER|SA_RESTART, 0x2b1167c4f2d0}, 8) = 0
wait4(4294967295, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 21609
wait4(4294967295, 0x7fffcd9be574, WNOHANG, NULL) = 0
rt_sigreturn(0xffffffff)                = -1 EINTR (Interrupted system call)
poll([{fd=4, events=POLLIN}], 1, -1)    = -1 EINTR (Interrupted system call)
--- SIGCHLD (Child exited) @ 0 (0) ---
rt_sigaction(SIGCHLD, {0x405260, [CHLD], SA_RESTORER|SA_RESTART, 0x2b1167c4f2d0}, {0x405260, [CHLD], SA_RESTORER|SA_RESTART, 0x2b1167c4f2d0}, 8) = 0
wait4(4294967295, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) = 21622
wait4(4294967295, 0x7fffcd9be574, WNOHANG, NULL) = 0
rt_sigreturn(0xffffffff)                = -1 EINTR (Interrupted system call)
poll([{fd=4, events=POLLIN}], 1, -1)    = 1 ([{fd=4, revents=POLLIN}])
fcntl(4, F_GETFL)                       = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK)    = 0
accept(4, {sa_family=AF_INET, sin_port=htons(2005), sin_addr=inet_addr("<IP Address so show>")}, [15350237863505559568]) = 5
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x2b1169929a80) = 21626
close(5)                                = 0
fcntl(4, F_SETFL, O_RDWR)               = 0
poll([{fd=4, events=POLLIN}], 1, -1)    = 1 ([{fd=4, revents=POLLIN}])
fcntl(4, F_GETFL)                       = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK)    = 0
accept(4, {sa_family=AF_INET, sin_port=htons(51774), sin_addr=inet_addr("<IP Address so show>")}, [4524428784237019152]) = 5
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x2b1169929a80) = 21627
close(5)                                = 0
fcntl(4, F_SETFL, O_RDWR)               = 0
poll([{fd=4, events=POLLIN}], 1, -1)    = 1 ([{fd=4, revents=POLLIN}])
fcntl(4, F_GETFL)                       = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK)    = 0
accept(4, 0x7fffcd9be920, [4524428784237019264]) = -1 ECONNABORTED (Software caused connection abort)
accept(4, 0x7fffcd9be920, [4524428784237019264]) = -1 EAGAIN (Resource temporarily unavailable)

fcntl(4, F_SETFL, O_RDWR)               = 0
time([1323309467])                      = 1323309467
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2183, ...}) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2183, ...}) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2183, ...}) = 0
sendto(3, "<19>Dec  8 12:57:47 perdition[73"..., 86, MSG_NOSIGNAL, NULL, 0) = -1 ENOTCONN (Transport endpoint is not connected)
close(3)                                = 0
socket(PF_FILE, SOCK_DGRAM, 0)          = 3
fcntl(3, F_SETFD, FD_CLOEXEC)           = 0
connect(3, {sa_family=AF_FILE, path="/dev/log"...}, 110) = 0
sendto(3, "<19>Dec  8 12:57:47 perdition[73"..., 86, MSG_NOSIGNAL, NULL, 0) = 86
unlink("/var/run/perdition.pop3/perdition.pop3.pid") = 0
getrlimit(RLIMIT_NOFILE, {rlim_cur=4*1024, rlim_max=4*1024}) = 0
close(0)                                = 0
close(1)                                = 0
close(2)                                = 0
close(3)                                = 0

....
..

The same failure was also caught on a different VM.

I checked the source; it seems 1.17 uses blocking I/O while 1.18 uses non-blocking I/O.  In vanessa_socket_server.c:

static pid_t
__vanessa_socket_server_accept(int *g, int listen_socket, int *listen_socketv,...
...
..

        for(;;) {
                addrlen = sizeof(from);
                *g = accept(listen_socket, (struct sockaddr *) &from, &addrlen);
                if (*g  < 0) {
                        if(errno == EINTR || errno == ECONNABORTED) {
                                continue; /* Ignore EINTR  and ECONNABORTED */
                        }
                        if (errno == EAGAIN || errno == EWOULDBLOCK)
                                return -1; /* Don't log EAGAIN or EWOULDBLOCK */
                        VANESSA_LOGGER_DEBUG_ERRNO("accept");
                        return(-1);
                }

....
..

So if perdition hits an ECONNABORTED, it calls accept() again, which returns -1 (EAGAIN) if there is no
pending incoming connection. After a few such returns, it seems (I am not sure) that it drops out of the loop,
does "goto out", and finally returns to perdition, which then exits the program.

Is there a patch for this, or is it fixed in a later version?

Eric