summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/run.c38
1 files changed, 29 insertions, 9 deletions
diff --git a/src/run.c b/src/run.c
index caf0b38..2e373ce 100644
--- a/src/run.c
+++ b/src/run.c
@@ -207,7 +207,7 @@ void local_handler(int fd, void *data)
static void do_overrun_alarm(struct alarm_block *a, void *data)
{
nl_overrun_request_resync(STATE(overrun));
- add_alarm(&STATE(overrun_alarm), 2, 0);
+ STATE(stats).nl_kernel_table_resync++;
}
static int event_handler(enum nf_conntrack_msg_type type,
@@ -378,6 +378,9 @@ init(void)
return 0;
}
+/* interval of 30s. for between two overrun */
+#define OVRUN_INT 30
+
static void __run(struct timeval *next_alarm)
{
int ret;
@@ -406,15 +409,33 @@ static void __run(struct timeval *next_alarm)
if (ret == -1) {
switch(errno) {
case ENOBUFS:
- /*
- * It seems that ctnetlink can't back off,
- * it's likely that we're losing events.
- * Solution: duplicate the socket buffer
- * size and resync with master conntrack table.
+ /* We have hit ENOBUFS, it's likely that we are
+ * losing events. Two possible situations may
+ * trigger this error:
+ *
+ * 1) The netlink receiver buffer is too small:
+ * increasing the netlink buffer size should
+ * be enough. However, some event messages
+ * got lost. We have to resync ourselves
+ * with the kernel table conntrack table to
+ * resolve the inconsistency.
+ *
+ * 2) The receiver is too slow to process the
+ * netlink messages so that the queue gets
+ * full quickly. This generally happens
+ * if the system is under heavy workload
+ * (busy CPU). In this case, increasing the
+ * size of the netlink receiver buffer
+ * would not help anymore since we would
+ * be delaying the overrun. Moreover, we
+ * should avoid resynchronizations. We
+ * should do our best here and keep
+ * replicating as much states as possible.
+ * If workload lowers at some point,
+ * we resync ourselves.
*/
nl_resize_socket_buffer(STATE(event));
- nl_overrun_request_resync(STATE(overrun));
- add_alarm(&STATE(overrun_alarm), 2, 0);
+ add_alarm(&STATE(overrun_alarm), OVRUN_INT, 0);
STATE(stats).nl_catch_event_failed++;
STATE(stats).nl_overrun++;
break;
@@ -435,7 +456,6 @@ static void __run(struct timeval *next_alarm)
}
if (FD_ISSET(nfct_fd(STATE(overrun)), &readfds)) {
- del_alarm(&STATE(overrun_alarm));
nfct_catch(STATE(overrun));
if (STATE(mode)->purge)
STATE(mode)->purge();