Hi,
We migrated the perdition machine to a VM and also upgraded the package
from perdition-1.17
to perdition-1.18.
From time to time, this new perdition just dies,
randomly, a few times a
day. I finally had time
to strace perdition.pop3 as it died to see what is happening; here is the info:
accept(4, {sa_family=AF_INET, sin_port=htons(53846),
sin_addr=inet_addr("<IP Address so show>")}, [6256062840960974864]) = 5
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x2b1169929a80) = 21625
close(5) = 0
fcntl(4, F_SETFL, O_RDWR) = 0
poll([{fd=4, events=POLLIN}], 1, -1) = -1 EINTR (Interrupted system call)
--- SIGCHLD (Child exited) @ 0 (0) ---
rt_sigaction(SIGCHLD, {0x405260, [CHLD], SA_RESTORER|SA_RESTART,
0x2b1167c4f2d0}, {0x405260, [CHLD], SA_RESTORER|SA_RESTART,
0x2b1167c4f2d0}, 8) = 0
wait4(4294967295, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) =
21609
wait4(4294967295, 0x7fffcd9be574, WNOHANG, NULL) = 0
rt_sigreturn(0xffffffff) = -1 EINTR (Interrupted system call)
poll([{fd=4, events=POLLIN}], 1, -1) = -1 EINTR (Interrupted system call)
--- SIGCHLD (Child exited) @ 0 (0) ---
rt_sigaction(SIGCHLD, {0x405260, [CHLD], SA_RESTORER|SA_RESTART,
0x2b1167c4f2d0}, {0x405260, [CHLD], SA_RESTORER|SA_RESTART,
0x2b1167c4f2d0}, 8) = 0
wait4(4294967295, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG, NULL) =
21622
wait4(4294967295, 0x7fffcd9be574, WNOHANG, NULL) = 0
rt_sigreturn(0xffffffff) = -1 EINTR (Interrupted system call)
poll([{fd=4, events=POLLIN}], 1, -1) = 1 ([{fd=4, revents=POLLIN}])
fcntl(4, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
accept(4, {sa_family=AF_INET, sin_port=htons(2005), sin_addr=inet_addr("<IP
Address so show>")}, [15350237863505559568]) = 5
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x2b1169929a80) = 21626
close(5) = 0
fcntl(4, F_SETFL, O_RDWR) = 0
poll([{fd=4, events=POLLIN}], 1, -1) = 1 ([{fd=4, revents=POLLIN}])
fcntl(4, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
accept(4, {sa_family=AF_INET, sin_port=htons(51774), sin_addr=inet_addr("<IP
Address so show>")}, [4524428784237019152]) = 5
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD,
child_tidptr=0x2b1169929a80) = 21627
close(5) = 0
fcntl(4, F_SETFL, O_RDWR) = 0
poll([{fd=4, events=POLLIN}], 1, -1) = 1 ([{fd=4, revents=POLLIN}])
fcntl(4, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
*accept(4, 0x7fffcd9be920, [4524428784237019264]) = -1 ECONNABORTED
(Software caused connection abort)
accept(4, 0x7fffcd9be920, [4524428784237019264]) = -1 EAGAIN (Resource
temporarily unavailable)*
fcntl(4, F_SETFL, O_RDWR) = 0
time([1323309467]) = 1323309467
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2183, ...}) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2183, ...}) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=2183, ...}) = 0
sendto(3, "<19>Dec 8 12:57:47 perdition[73"..., 86, MSG_NOSIGNAL, NULL,
0)
= -1 ENOTCONN (Transport endpoint is not connected)
close(3) = 0
socket(PF_FILE, SOCK_DGRAM, 0) = 3
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
connect(3, {sa_family=AF_FILE, path="/dev/log"...}, 110) = 0
sendto(3, "<19>Dec 8 12:57:47 perdition[73"..., 86, MSG_NOSIGNAL, NULL,
0)
= 86
unlink("/var/run/perdition.pop3/perdition.pop3.pid") = 0
getrlimit(RLIMIT_NOFILE, {rlim_cur=4*1024, rlim_max=4*1024}) = 0
close(0) = 0
close(1) = 0
close(2) = 0
close(3) = 0
....
..
The same failure was also caught on a different VM.
I checked the source; it seems 1.17 uses blocking I/O while 1.18 uses
non-blocking I/O. In vanessa_socket_server.c:
static pid_t
__vanessa_socket_server_accept(int *g, int listen_socket, int
*listen_socketv,...
...
..
for(;;) {
addrlen = sizeof(from);
*g = accept(listen_socket, (struct sockaddr *) &from,
&addrlen); if (*g < 0) {
if(errno == EINTR || errno == ECONNABORTED) {
continue; /* Ignore EINTR and ECONNABORTED
*/
}
if (errno == EAGAIN || errno == EWOULDBLOCK)
return -1; /* Don't log EAGAIN or
EWOULDBLOCK */ VANESSA_LOGGER_DEBUG_ERRNO("accept");
return(-1);
}
....
..
So if perdition hits an ECONNABORTED, it calls accept() again, which returns
-1 if there is no
incoming connection. After a few such returns, it seems (I am not sure) that
it breaks out of the loop,
does a "goto out", and finally returns to perdition, which exits the program.
Is there a patch for this, or is it fixed in a later version?
Hi Eric,
thanks for the bug report and sorry for not responding earlier.
I am not aware of a fix (or other report) for this problem.
My feeling is that it should be resolved by the caller (direct or indirect)
__vanessa_socket_server_accept could just ignore EAGAIN and EWOULDBLOCK
(as it ignores EINTR and ECONNABORTED) if (opt & O_NONBLOCK) is false.
I am thinking of something like the following.
diff -r 86186acdf27e libvanessa_socket/vanessa_socket_server.c
--- a/libvanessa_socket/vanessa_socket_server.c Thu May 19 13:04:05 2011 +0900
+++ b/libvanessa_socket/vanessa_socket_server.c Fri Dec 30 14:08:16 2011 +0900
@@ -368,12 +368,12 @@
addrlen = sizeof(from);
*g = accept(listen_socket, (struct sockaddr *) &from, &addrlen);
if (*g < 0) {
- if(errno == EINTR || errno == ECONNABORTED) {
- continue; /* Ignore EINTR and ECONNABORTED */
- }
if (opt & O_NONBLOCK &&
(errno == EAGAIN || errno == EWOULDBLOCK))
return -1; /* Don't log EAGAIN or EWOULDBLOCK */
+ if (errno == EINTR || errno == ECONNABORTED ||
+ errno == EAGAIN || errno == EWOULDBLOCK)
+ continue; /* Ignore */
VANESSA_LOGGER_DEBUG_ERRNO("accept");
return(-1);
}