diff options
| -rw-r--r-- | src/run.c | 38 | 
1 files changed, 29 insertions, 9 deletions
| @@ -207,7 +207,7 @@ void local_handler(int fd, void *data)  static void do_overrun_alarm(struct alarm_block *a, void *data)  {  	nl_overrun_request_resync(STATE(overrun)); -	add_alarm(&STATE(overrun_alarm), 2, 0); +	STATE(stats).nl_kernel_table_resync++;  }  static int event_handler(enum nf_conntrack_msg_type type, @@ -378,6 +378,9 @@ init(void)  	return 0;  } +/* interval of 30s. for between two overrun */ +#define OVRUN_INT	30 +  static void __run(struct timeval *next_alarm)  {  	int ret; @@ -406,15 +409,33 @@ static void __run(struct timeval *next_alarm)  		if (ret == -1) {  			switch(errno) {  			case ENOBUFS: -				/* -				 * It seems that ctnetlink can't back off, -				 * it's likely that we're losing events. -				 * Solution: duplicate the socket buffer -				 * size and resync with master conntrack table. +				/* We have hit ENOBUFS, it's likely that we are +				 * losing events. Two possible situations may +				 * trigger this error: +				 * +				 * 1) The netlink receiver buffer is too small: +				 *    increasing the netlink buffer size should +				 *    be enough. However, some event messages +				 *    got lost. We have to resync ourselves +				 *    with the kernel table conntrack table to +				 *    resolve the inconsistency.  +				 * +				 * 2) The receiver is too slow to process the +				 *    netlink messages so that the queue gets +				 *    full quickly. This generally happens +				 *    if the system is under heavy workload +				 *    (busy CPU). In this case, increasing the +				 *    size of the netlink receiver buffer +				 *    would not help anymore since we would +				 *    be delaying the overrun. Moreover, we +				 *    should avoid resynchronizations. We  +				 *    should do our best here and keep +				 *    replicating as much states as possible. +				 *    If workload lowers at some point, +				 *    we resync ourselves.  				 */  				nl_resize_socket_buffer(STATE(event)); -				nl_overrun_request_resync(STATE(overrun)); -				add_alarm(&STATE(overrun_alarm), 2, 0); +				add_alarm(&STATE(overrun_alarm), OVRUN_INT, 0);  				STATE(stats).nl_catch_event_failed++;  				STATE(stats).nl_overrun++;  				break; @@ -435,7 +456,6 @@ static void __run(struct timeval *next_alarm)  	}  	if (FD_ISSET(nfct_fd(STATE(overrun)), &readfds)) { -		del_alarm(&STATE(overrun_alarm));  		nfct_catch(STATE(overrun));  		if (STATE(mode)->purge)  			STATE(mode)->purge(); | 
