src/cache-ct.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358

/*
 * (C) 2006-2011 by Pablo Neira Ayuso <pablo@netfilter.org>
 * (C) 2011 by Vyatta Inc. <http://www.vyatta.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "cache.h"
#include "hash.h"
#include "log.h"
#include "conntrackd.h"
#include "netlink.h"
#include "event.h"
#include "jhash.h"
#include "network.h"

#include <errno.h>
#include <string.h>
#include <time.h>
#include <libnetfilter_conntrack/libnetfilter_conntrack.h>

static uint32_t
cache_hash4_ct(const struct nf_conntrack *ct, const struct hashtable *table)
{
	uint32_t a[4] = {
		[0]	= nfct_get_attr_u32(ct, ATTR_IPV4_SRC),
		[1]	= nfct_get_attr_u32(ct, ATTR_IPV4_DST),
		[2]	= nfct_get_attr_u8(ct, ATTR_L3PROTO) << 16 |
			  nfct_get_attr_u8(ct, ATTR_L4PROTO),
		[3]	= nfct_get_attr_u16(ct, ATTR_PORT_SRC) << 16 |
			  nfct_get_attr_u16(ct, ATTR_PORT_DST),
	};

	/*
	 * Instead of returning hash % table->hashsize (implying a divide)
	 * we return the high 32 bits of the (hash * table->hashsize) that will
	 * give results between [0 and hashsize-1] and same hash distribution,
	 * but using a multiply, less expensive than a divide. See:
	 * http://www.mail-archive.com/netdev@vger.kernel.org/msg56623.html
	 */
	return ((uint64_t)jhash2(a, 4, 0) * table->hashsize) >> 32;
}

static uint32_t
cache_hash6_ct(const struct nf_conntrack *ct, const struct hashtable *table)
{
	uint32_t a[10];

	memcpy(&a[0], nfct_get_attr(ct, ATTR_IPV6_SRC), sizeof(uint32_t)*4);
	memcpy(&a[4], nfct_get_attr(ct, ATTR_IPV6_SRC), sizeof(uint32_t)*4);
	a[8] = nfct_get_attr_u8(ct, ATTR_ORIG_L3PROTO) << 16 |
	       nfct_get_attr_u8(ct, ATTR_ORIG_L4PROTO);
	a[9] = nfct_get_attr_u16(ct, ATTR_ORIG_PORT_SRC) << 16 |
	       nfct_get_attr_u16(ct, ATTR_ORIG_PORT_DST);

	return ((uint64_t)jhash2(a, 10, 0) * table->hashsize) >> 32;
}

static uint32_t
cache_ct_hash(const void *data, const struct hashtable *table)
{
	int ret = 0;
	const struct nf_conntrack *ct = data;

	switch(nfct_get_attr_u8(ct, ATTR_L3PROTO)) {
		case AF_INET:
			ret = cache_hash4_ct(ct, table);
			break;
		case AF_INET6:
			ret = cache_hash6_ct(ct, table);
			break;
		default:
			dlog(LOG_ERR, "unknown layer 3 proto in hash");
			break;
	}
	return ret;
}

static int cache_ct_cmp(const void *data1, const void *data2)
{
	const struct cache_object *obj = data1;
	const struct nf_conntrack *ct = data2;

	return nfct_cmp(obj->ptr, ct, NFCT_CMP_ORIG) &&
	       nfct_get_attr_u32(obj->ptr, ATTR_ID) ==
	       nfct_get_attr_u32(ct, ATTR_ID);
}

static void *cache_ct_alloc(void)
{
	return nfct_new();
}

static void cache_ct_free(void *ptr)
{
	nfct_destroy(ptr);
}

static void cache_ct_copy(void *dst, void *src, unsigned int flags)
{
	nfct_copy(dst, src, flags);
}

static int cache_ct_dump_step(void *data1, void *n)
{
	char buf[1024];
	int size;
	struct __dump_container *container = data1;
	struct cache_object *obj = n;
	char *data = obj->data;
	unsigned i;

	/*
	 * XXX: Do not dump the entries that are scheduled to expire.
	 * 	These entries talk about already destroyed connections
	 * 	that we keep for some time just in case that we have to
	 * 	resent some lost messages. We do not show them to the
	 * 	user as he may think that the firewall replicas are not
	 * 	in sync. The branch below is a hack as it is quite
	 * 	specific and it breaks conntrackd modularity. Probably
	 * 	there's a nicer way to do this but until I come up with it...
	 */
	if (CONFIG(flags) & CTD_SYNC_FTFW && obj->status == C_OBJ_DEAD)
		return 0;

	/* do not show cached timeout, this may confuse users */
	if (nfct_attr_is_set(obj->ptr, ATTR_TIMEOUT))
		nfct_attr_unset(obj->ptr, ATTR_TIMEOUT);

	memset(buf, 0, sizeof(buf));
	size = nfct_snprintf(buf, 
			     sizeof(buf), 
			     obj->ptr,
			     NFCT_T_UNKNOWN, 
			     container->type,
			     0);

	for (i = 0; i < obj->cache->num_features; i++) {
		if (obj->cache->features[i]->dump) {
			size += obj->cache->features[i]->dump(obj, 
							      data, 
							      buf+size,
							      container->type);
			data += obj->cache->features[i]->size;
		}
	}
	if (container->type != NFCT_O_XML) {
		long tm = time(NULL);
		size += sprintf(buf+size, " [active since %lds]",
				tm - obj->lifetime);
	}
	size += sprintf(buf+size, "\n");
	if (send(container->fd, buf, size, 0) == -1) {
		if (errno != EPIPE)
			return -1;
	}

	return 0;
}

static void
cache_ct_commit_step(struct __commit_container *tmp, struct cache_object *obj)
{
	int ret, retry = 1, timeout;
	struct nf_conntrack *ct = obj->ptr;

	if (CONFIG(commit_timeout)) {
		timeout = CONFIG(commit_timeout);
	} else {
		timeout = time(NULL) - obj->lastupdate;
		if (timeout < 0) {
			/* XXX: Arbitrarily set the timer to one minute, how
			 * can this happen? For example, an adjustment due to
			 * daylight-saving. Probably other situations can
			 * trigger this. */
			timeout = 60;
		}
		/* calculate an estimation of the current timeout */
		timeout = nfct_get_attr_u32(ct, ATTR_TIMEOUT) - timeout;
		if (timeout < 0) {
			timeout = 60;
		}
	}

retry:
	if (nl_create_conntrack(tmp->h, ct, timeout) == -1) {
		if (errno == EEXIST && retry == 1) {
			ret = nl_destroy_conntrack(tmp->h, ct);
			if (ret == 0 || (ret == -1 && errno == ENOENT)) {
				if (retry) {
					retry = 0;
					goto retry;
				}
			}
			dlog(LOG_ERR, "commit-destroy: %s", strerror(errno));
			dlog_ct(STATE(log), ct, NFCT_O_PLAIN);
			tmp->c->stats.commit_fail++;
		} else {
			dlog(LOG_ERR, "commit-create: %s", strerror(errno));
			dlog_ct(STATE(log), ct, NFCT_O_PLAIN);
			tmp->c->stats.commit_fail++;
		}
	} else {
		tmp->c->stats.commit_ok++;
	}
}

static int cache_ct_commit_related(void *data, void *n)
{
	struct cache_object *obj = n;

	if (ct_is_related(obj->ptr))
		cache_ct_commit_step(data, obj);

	/* keep iterating even if we have found errors */
	return 0;
}

static int cache_ct_commit_master(void *data, void *n)
{
	struct cache_object *obj = n;

	if (ct_is_related(obj->ptr))
		return 0;

	cache_ct_commit_step(data, obj);
	return 0;
}

static int cache_ct_commit(struct cache *c, struct nfct_handle *h, int clientfd)
{
	unsigned int commit_ok, commit_fail;
	struct __commit_container tmp = {
		.h = h,
		.c = c,
	};
	struct timeval commit_stop, res;

	/* we already have one commit in progress, skip this. The clientfd
	 * descriptor has to be closed by the caller. */
	if (clientfd && STATE_SYNC(commit).clientfd != -1)
		return 0;

	switch(STATE_SYNC(commit).state) {
	case COMMIT_STATE_INACTIVE:
		gettimeofday(&STATE_SYNC(commit).stats.start, NULL);
		STATE_SYNC(commit).stats.ok = c->stats.commit_ok;
		STATE_SYNC(commit).stats.fail = c->stats.commit_fail;
		STATE_SYNC(commit).clientfd = clientfd;
	case COMMIT_STATE_MASTER:
		STATE_SYNC(commit).current =
			hashtable_iterate_limit(c->h, &tmp,
						STATE_SYNC(commit).current,
						CONFIG(general).commit_steps,
						cache_ct_commit_master);
		if (STATE_SYNC(commit).current < CONFIG(hashsize)) {
			STATE_SYNC(commit).state = COMMIT_STATE_MASTER;
			/* give it another step as soon as possible */
			write_evfd(STATE_SYNC(commit).evfd);
			return 1;
		}
		STATE_SYNC(commit).current = 0;
		STATE_SYNC(commit).state = COMMIT_STATE_RELATED;
	case COMMIT_STATE_RELATED:
		STATE_SYNC(commit).current =
			hashtable_iterate_limit(c->h, &tmp,
						STATE_SYNC(commit).current,
						CONFIG(general).commit_steps,
						cache_ct_commit_related);
		if (STATE_SYNC(commit).current < CONFIG(hashsize)) {
			STATE_SYNC(commit).state = COMMIT_STATE_RELATED;
			/* give it another step as soon as possible */
			write_evfd(STATE_SYNC(commit).evfd);
			return 1;
		}
		/* calculate the time that commit has taken */
		gettimeofday(&commit_stop, NULL);
		timersub(&commit_stop, &STATE_SYNC(commit).stats.start, &res);

		/* calculate new entries committed */
		commit_ok = c->stats.commit_ok - STATE_SYNC(commit).stats.ok;
		commit_fail = 
			c->stats.commit_fail - STATE_SYNC(commit).stats.fail;

		/* log results */
		dlog(LOG_NOTICE, "Committed %u new entries", commit_ok);

		if (commit_fail)
			dlog(LOG_NOTICE, "%u entries can't be "
					 "committed", commit_fail);

		dlog(LOG_NOTICE, "commit has taken %lu.%06lu seconds", 
				res.tv_sec, res.tv_usec);

		/* prepare the state machine for new commits */
		STATE_SYNC(commit).current = 0;
		STATE_SYNC(commit).state = COMMIT_STATE_INACTIVE;

		/* Close the client socket now that we're done. */
		close(STATE_SYNC(commit).clientfd);
		STATE_SYNC(commit).clientfd = -1;
	}
	return 1;
}

static struct nethdr *
cache_ct_build_msg(const struct cache_object *obj, int type)
{
	return BUILD_NETMSG_FROM_CT(obj->ptr, type);
}

/* template to cache conntracks coming from the kernel. */
struct cache_ops cache_sync_internal_ct_ops = {
	.hash		= cache_ct_hash,
	.cmp		= cache_ct_cmp,
	.alloc		= cache_ct_alloc,
	.free		= cache_ct_free,
	.copy		= cache_ct_copy,
	.dump_step	= cache_ct_dump_step,
	.commit		= NULL,
	.build_msg	= cache_ct_build_msg,
};

/* template to cache conntracks coming from the network. */
struct cache_ops cache_sync_external_ct_ops = {
	.hash		= cache_ct_hash,
	.cmp		= cache_ct_cmp,
	.alloc		= cache_ct_alloc,
	.free		= cache_ct_free,
	.copy		= cache_ct_copy,
	.dump_step	= cache_ct_dump_step,
	.commit		= cache_ct_commit,
	.build_msg	= NULL,
};

/* template to cache conntracks for the statistics mode. */
struct cache_ops cache_stats_ct_ops = {
	.hash		= cache_ct_hash,
	.cmp		= cache_ct_cmp,
	.alloc		= cache_ct_alloc,
	.free		= cache_ct_free,
	.copy		= cache_ct_copy,
	.dump_step	= cache_ct_dump_step,
	.commit		= NULL,
	.build_msg	= NULL,
};