Hi,
We migrated the perdition machine to a VM and also upgraded the package from perdition-1.17
to perdition-1.18.
From time to time, the new perdition just dies at random, a few times a day. I finally had time
to strace perdition.pop3 as it died to see what is happening; here is the output:
accept(4, {sa_family=AF_INET, sin_port=htons(53846),
sin_addr=inet_addr("<IP Address so show>")}, [6256062840960974864]) = 5
clone(child_stack=0,
flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x2b1169929a80) = 21625
close(5) = 0
fcntl(4, F_SETFL, O_RDWR) = 0
poll([{fd=4, events=POLLIN}], 1, -1) = -1 EINTR (Interrupted system
call)
--- SIGCHLD (Child exited) @ 0 (0) ---
rt_sigaction(SIGCHLD, {0x405260, [CHLD], SA_RESTORER|SA_RESTART,
0x2b1167c4f2d0}, {0x405260, [CHLD], SA_RESTORER|SA_RESTART,
0x2b1167c4f2d0}, 8) = 0
wait4(4294967295, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}],
WNOHANG, NULL) = 21609
wait4(4294967295, 0x7fffcd9be574, WNOHANG, NULL) = 0
rt_sigreturn(0xffffffff) = -1 EINTR (Interrupted system
call)
poll([{fd=4, events=POLLIN}], 1, -1) = -1 EINTR (Interrupted system
call)
--- SIGCHLD (Child exited) @ 0 (0) ---
rt_sigaction(SIGCHLD, {0x405260, [CHLD], SA_RESTORER|SA_RESTART,
0x2b1167c4f2d0}, {0x405260, [CHLD], SA_RESTORER|SA_RESTART,
0x2b1167c4f2d0}, 8) = 0
wait4(4294967295, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}],
WNOHANG, NULL) = 21622
wait4(4294967295, 0x7fffcd9be574, WNOHANG, NULL) = 0
rt_sigreturn(0xffffffff) = -1 EINTR (Interrupted system
call)
poll([{fd=4, events=POLLIN}], 1, -1) = 1 ([{fd=4, revents=POLLIN}])
fcntl(4, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
accept(4, {sa_family=AF_INET, sin_port=htons(2005),
sin_addr=inet_addr("<IP Address so show>")}, [15350237863505559568]) = 5
clone(child_stack=0,
flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x2b1169929a80) = 21626
close(5) = 0
fcntl(4, F_SETFL, O_RDWR) = 0
poll([{fd=4, events=POLLIN}], 1, -1) = 1 ([{fd=4, revents=POLLIN}])
fcntl(4, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
accept(4, {sa_family=AF_INET, sin_port=htons(51774),
sin_addr=inet_addr("<IP Address so show>")}, [4524428784237019152]) = 5
clone(child_stack=0,
flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x2b1169929a80) = 21627
close(5) = 0
fcntl(4, F_SETFL, O_RDWR) = 0
poll([{fd=4, events=POLLIN}], 1, -1) = 1 ([{fd=4, revents=POLLIN}])
fcntl(4, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
accept(4, 0x7fffcd9be920, [4524428784237019264]) = -1 ECONNABORTED
(Software caused connection abort)
accept(4, 0x7fffcd9be920, [4524428784237019264]) = -1 EAGAIN (Resource
temporarily unavailable)
fcntl(4, F_SETFL, O_RDWR) = 0
time([1323309467]) = 1323309467
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2183, ...}) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2183, ...}) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2183, ...}) = 0
sendto(3, "<19>Dec 8 12:57:47 perdition[73"..., 86,
MSG_NOSIGNAL, NULL, 0) = -1 ENOTCONN (Transport endpoint is not
connected)
close(3) = 0
socket(PF_FILE, SOCK_DGRAM, 0) = 3
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
connect(3, {sa_family=AF_FILE, path="/dev/log"...}, 110) = 0
sendto(3, "<19>Dec 8 12:57:47 perdition[73"..., 86,
MSG_NOSIGNAL, NULL, 0) = 86
unlink("/var/run/perdition.pop3/perdition.pop3.pid") = 0
getrlimit(RLIMIT_NOFILE, {rlim_cur=4*1024, rlim_max=4*1024}) = 0
close(0) = 0
close(1) = 0
close(2) = 0
close(3) = 0
....
..
The same behaviour was also caught on a different VM.
I checked the source; it seems 1.17 uses blocking I/O while 1.18 uses non-blocking I/O. In vanessa_socket_server.c:
static pid_t
__vanessa_socket_server_accept(int *g, int listen_socket, int *listen_socketv,...
...
..
for(;;) {
addrlen = sizeof(from);
*g = accept(listen_socket, (struct sockaddr *) &from, &addrlen); if (*g < 0) {
if(errno == EINTR || errno == ECONNABORTED) {
continue; /* Ignore EINTR and ECONNABORTED */
}
if (errno == EAGAIN || errno == EWOULDBLOCK)
return -1; /* Don't log EAGAIN or EWOULDBLOCK */ VANESSA_LOGGER_DEBUG_ERRNO("accept");
return(-1);
}
....
..
So if perdition hits an ECONNABORTED, it calls accept() again, which returns -1 (EAGAIN) if there
is no pending incoming connection. After a few such returns, it seems (I am not sure) that it breaks
out of the loop, does "goto out", and finally returns to perdition, which exits the program.
Is there a patch for that, or does a later version fix this?
Eric