/* src/cache-exp.c */

/*
 * (C) 2006-2011 by Pablo Neira Ayuso <pablo@netfilter.org>
 * (C) 2011 by Vyatta Inc. <http://www.vyatta.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "cache.h"
#include "hash.h"
#include "log.h"
#include "conntrackd.h"
#include "netlink.h"
#include "event.h"
#include "jhash.h"
#include "network.h"

#include <errno.h>
#include <string.h>
#include <time.h>
#include <libnetfilter_conntrack/libnetfilter_conntrack.h>

/*
 * Compute the bucket index in @table for the IPv4 conntrack @ct.
 *
 * The hash key is built from the source and destination addresses, the
 * layer 3 and layer 4 protocol numbers, and the source and destination
 * ports of @ct.
 */
static uint32_t
cache_hash4_exp(const struct nf_conntrack *ct, const struct hashtable *table)
{
	/*
	 * The 8 and 16-bit values are widened to uint32_t before shifting:
	 * otherwise they are promoted to signed int, and shifting a port
	 * >= 0x8000 left by 16 overflows into the sign bit (undefined).
	 */
	uint32_t a[4] = {
		[0]	= nfct_get_attr_u32(ct, ATTR_IPV4_SRC),
		[1]	= nfct_get_attr_u32(ct, ATTR_IPV4_DST),
		[2]	= (uint32_t)nfct_get_attr_u8(ct, ATTR_L3PROTO) << 16 |
			  (uint32_t)nfct_get_attr_u8(ct, ATTR_L4PROTO),
		[3]	= (uint32_t)nfct_get_attr_u16(ct, ATTR_PORT_SRC) << 16 |
			  (uint32_t)nfct_get_attr_u16(ct, ATTR_PORT_DST),
	};

	/*
	 * Instead of returning hash % table->hashsize (implying a divide)
	 * we return the high 32 bits of the (hash * table->hashsize) that will
	 * give results between [0 and hashsize-1] and same hash distribution,
	 * but using a multiply, less expensive than a divide. See:
	 * http://www.mail-archive.com/netdev@vger.kernel.org/msg56623.html
	 */
	return ((uint64_t)jhash2(a, 4, 0) * table->hashsize) >> 32;
}

static uint32_t
cache_hash6_exp(const struct nf_conntrack *ct, const struct hashtable *table)
{
	uint32_t a[10];

	memcpy(&a[0], nfct_get_attr(ct, ATTR_IPV6_SRC), sizeof(uint32_t)*4);
	memcpy(&a[4], nfct_get_attr(ct, ATTR_IPV6_SRC), sizeof(uint32_t)*4);
	a[8] = nfct_get_attr_u8(ct, ATTR_ORIG_L3PROTO) << 16 |
	       nfct_get_attr_u8(ct, ATTR_ORIG_L4PROTO);
	a[9] = nfct_get_attr_u16(ct, ATTR_ORIG_PORT_SRC) << 16 |
	       nfct_get_attr_u16(ct, ATTR_ORIG_PORT_DST);

	return ((uint64_t)jhash2(a, 10, 0) * table->hashsize) >> 32;
}

/*
 * Hash callback for expectation caches: @data is a struct nf_expect.
 *
 * The hash is computed over the conntrack stored in the expectation's
 * ATTR_EXP_MASTER attribute, using the IPv4 or IPv6 helper depending on
 * its layer 3 protocol. An unknown protocol is logged and hashes to 0.
 */
static uint32_t
cache_exp_hash(const void *data, const struct hashtable *table)
{
	const struct nf_expect *exp = data;
	const struct nf_conntrack *master =
		nfexp_get_attr(exp, ATTR_EXP_MASTER);
	const int l3proto = nfct_get_attr_u8(master, ATTR_L3PROTO);

	if (l3proto == AF_INET)
		return cache_hash4_exp(master, table);

	if (l3proto == AF_INET6)
		return cache_hash6_exp(master, table);

	dlog(LOG_ERR, "unknown layer 3 proto in hash");
	return 0;
}

/*
 * Comparison callback: @data1 is a cache object, @data2 an expectation.
 * Returns the result of nfexp_cmp() between the expectation stored in
 * the cache object and @data2, with no comparison flags set.
 */
static int cache_exp_cmp(const void *data1, const void *data2)
{
	const struct cache_object *cached = data1;

	return nfexp_cmp(cached->ptr, data2, 0);
}

/* Allocation callback: returns whatever nfexp_new() returns. */
static void *cache_exp_alloc(void)
{
	struct nf_expect *fresh = nfexp_new();

	return fresh;
}

/* Release callback: hands the expectation @ptr to nfexp_destroy(). */
static void cache_exp_free(void *ptr)
{
	struct nf_expect *doomed = ptr;

	nfexp_destroy(doomed);
}

/*
 * Copy callback: raw copy of nfexp_maxsize() bytes from @src to @dst.
 * @flags is accepted for interface compatibility and is not used.
 */
static void cache_exp_copy(void *dst, void *src, unsigned int flags)
{
	(void)flags;

	/* XXX: add nfexp_copy(...) to libnetfilter_conntrack. */
	memcpy(dst, src, nfexp_maxsize());
}

/*
 * Dump one cached expectation to the client socket.
 *
 * @data1 is the struct __dump_container carrying the output type and
 * the destination file descriptor; @n is the cache object to print.
 *
 * The line is composed of the expectation itself, the output of every
 * cache feature that provides a dump() callback and, for non-XML output,
 * the time the entry has been active, followed by a newline.
 *
 * Returns 0 on success (or when the entry is skipped, or the peer has
 * closed the socket with EPIPE), -1 if send() fails with another error.
 */
static int cache_exp_dump_step(void *data1, void *n)
{
	char buf[1024];
	int size;
	struct __dump_container *container = data1;
	struct cache_object *obj = n;
	char *data = obj->data;
	unsigned i;

	/*
	 * XXX: Do not dump the entries that are scheduled to expire.
	 *	These entries talk about already destroyed connections
	 *	that we keep for some time just in case that we have to
	 *	resend some lost messages. We do not show them to the
	 *	user as he may think that the firewall replicas are not
	 *	in sync. The branch below is a hack as it is quite
	 *	specific and it breaks conntrackd modularity. Probably
	 *	there's a nicer way to do this but until I come up with it...
	 */
	if (CONFIG(flags) & CTD_SYNC_FTFW && obj->status == C_OBJ_DEAD)
		return 0;

	/* do not show cached timeout, this may confuse users */
	if (nfexp_attr_is_set(obj->ptr, ATTR_EXP_TIMEOUT))
		nfexp_attr_unset(obj->ptr, ATTR_EXP_TIMEOUT);

	memset(buf, 0, sizeof(buf));
	size = nfexp_snprintf(buf, sizeof(buf),obj->ptr,
			      NFCT_T_UNKNOWN, container->type, 0);
	/*
	 * NOTE(review): nfexp_snprintf() is assumed to follow snprintf()
	 * conventions (negative on error, length that would have been
	 * written on truncation). Either way, never use the return value
	 * as an offset outside of buf.
	 */
	if (size < 0)
		size = 0;
	else if ((size_t)size >= sizeof(buf))
		size = sizeof(buf) - 1;

	/*
	 * The feature dump() callbacks take no buffer length, so they
	 * cannot be bounded from here.
	 */
	for (i = 0; i < obj->cache->num_features; i++) {
		if (obj->cache->features[i]->dump) {
			size += obj->cache->features[i]->dump(obj, data,
							      buf+size,
							      container->type);
			data += obj->cache->features[i]->size;
		}
	}
	if (container->type != NFCT_O_XML &&
	    size >= 0 && (size_t)size < sizeof(buf)) {
		long tm = time(NULL);
		int len = snprintf(buf+size, sizeof(buf) - size,
				   " [active since %lds]",
				   tm - obj->lifetime);
		if (len > 0)
			size += len;
	}

	/* leave room for the trailing newline, truncating if needed */
	if (size < 0)
		size = 0;
	else if ((size_t)size > sizeof(buf) - 1)
		size = sizeof(buf) - 1;
	buf[size++] = '\n';

	if (send(container->fd, buf, size, 0) == -1) {
		if (errno != EPIPE)
			return -1;
	}

	return 0;
}

/*
 * Commit one cached expectation: @data is the struct __commit_container
 * with the handle and cache, @n is the cache object to commit.
 *
 * The expectation is created through nl_create_expect(). If that fails
 * with EEXIST, the existing entry is removed with nl_destroy_expect()
 * and the creation is attempted once more. Successes and failures are
 * accounted in the cache statistics.
 *
 * Always returns 0 so that the iteration continues after an error.
 */
static int cache_exp_commit_step(void *data, void *n)
{
	struct __commit_container *ctx = data;
	struct cache_object *obj = n;
	struct nf_expect *exp = obj->ptr;
	int may_retry = 1;
	int timeout, rc;

	if (!CONFIG(commit_timeout)) {
		int elapsed = time(NULL) - obj->lastupdate;

		if (elapsed < 0) {
			/* XXX: Arbitrarily set the timer to one minute, how
			 * can this happen? For example, an adjustment due to
			 * daylight-saving. Probably other situations can
			 * trigger this. */
			elapsed = 60;
		}
		/* calculate an estimation of the current timeout */
		timeout = nfexp_get_attr_u32(exp, ATTR_EXP_TIMEOUT) - elapsed;
		if (timeout < 0)
			timeout = 60;
	} else {
		/* a configured commit timeout overrides the estimation */
		timeout = CONFIG(commit_timeout);
	}

	for (;;) {
		if (nl_create_expect(ctx->h, exp, timeout) != -1) {
			ctx->c->stats.commit_ok++;
			break;
		}

		if (errno != EEXIST || !may_retry) {
			dlog(LOG_ERR, "commit-create: %s", strerror(errno));
			dlog_exp(STATE(log), exp, NFCT_O_PLAIN);
			ctx->c->stats.commit_fail++;
			break;
		}

		/* entry already exists: remove it and try once more */
		rc = nl_destroy_expect(ctx->h, exp);
		if (rc == 0 || (rc == -1 && errno == ENOENT)) {
			may_retry = 0;
			continue;
		}

		dlog(LOG_ERR, "commit-destroy: %s", strerror(errno));
		dlog_exp(STATE(log), exp, NFCT_O_PLAIN);
		ctx->c->stats.commit_fail++;
		break;
	}

	/* keep iterating even if we have found errors */
	return 0;
}

/*
 * Commit the expectations in cache @c using handle @h, in bounded steps.
 *
 * Each call processes up to CONFIG(general).commit_steps entries via
 * hashtable_iterate_limit() and records the position reached in
 * STATE_SYNC(commit).current. While entries remain, the event
 * descriptor is written so that another step is scheduled.
 *
 * Returns:
 *   -1 if @clientfd is set and another commit is already in progress
 *      (the caller has to close @clientfd),
 *    1 if more steps are pending (or the state is not handled here),
 *    0 once the whole table has been committed and the results logged.
 */
static int
cache_exp_commit(struct cache *c, struct nfct_handle *h, int clientfd)
{
	unsigned int commit_ok, commit_fail;
	struct timeval commit_stop, res;
	struct __commit_container tmp = {
		.h = h,
		.c = c,
	};

	/* we already have one commit in progress, skip this. The clientfd
	 * descriptor has to be closed by the caller. */
	if (clientfd && STATE_SYNC(commit).clientfd != -1)
		return -1;

	switch(STATE_SYNC(commit).state) {
	case COMMIT_STATE_INACTIVE:
		/* first step: snapshot start time and counters */
		gettimeofday(&STATE_SYNC(commit).stats.start, NULL);
		STATE_SYNC(commit).stats.ok = c->stats.commit_ok;
		STATE_SYNC(commit).stats.fail = c->stats.commit_fail;
		STATE_SYNC(commit).clientfd = clientfd;
		/* fall through */
	case COMMIT_STATE_MASTER:
		STATE_SYNC(commit).current =
			hashtable_iterate_limit(c->h, &tmp,
						STATE_SYNC(commit).current,
						CONFIG(general).commit_steps,
						cache_exp_commit_step);
		if (STATE_SYNC(commit).current < CONFIG(hashsize)) {
			STATE_SYNC(commit).state = COMMIT_STATE_MASTER;
			/* give it another step as soon as possible */
			write_evfd(STATE_SYNC(commit).evfd);
			return 1;
		}

		/* calculate the time that commit has taken */
		gettimeofday(&commit_stop, NULL);
		timersub(&commit_stop, &STATE_SYNC(commit).stats.start, &res);

		/* calculate new entries committed */
		commit_ok = c->stats.commit_ok - STATE_SYNC(commit).stats.ok;
		commit_fail =
			c->stats.commit_fail - STATE_SYNC(commit).stats.fail;

		/* log results */
		dlog(LOG_NOTICE, "Committed %u new expectations", commit_ok);

		if (commit_fail)
			dlog(LOG_NOTICE, "%u expectations can't be "
					 "committed", commit_fail);

		/* tv_sec and tv_usec are not guaranteed to be unsigned long:
		 * cast them so that they match the %lu conversions. */
		dlog(LOG_NOTICE, "commit has taken %lu.%06lu seconds",
			(unsigned long)res.tv_sec,
			(unsigned long)res.tv_usec);

		/* prepare the state machine for new commits */
		STATE_SYNC(commit).current = 0;
		STATE_SYNC(commit).state = COMMIT_STATE_INACTIVE;

		return 0;
	}
	return 1;
}

/*
 * Build-message callback: builds a network message of the given @type
 * from the expectation stored in @obj using BUILD_NETMSG_FROM_EXP().
 */
static struct nethdr *
cache_exp_build_msg(const struct cache_object *obj, int type)
{
	struct nethdr *net = BUILD_NETMSG_FROM_EXP(obj->ptr, type);

	return net;
}

/*
 * Operations for the cache of expectations coming from the kernel.
 * This cache can build network messages but has no commit operation.
 */
struct cache_ops cache_sync_internal_exp_ops = {
	/* object lifecycle */
	.alloc		= cache_exp_alloc,
	.free		= cache_exp_free,
	.copy		= cache_exp_copy,
	/* hashtable lookup */
	.hash		= cache_exp_hash,
	.cmp		= cache_exp_cmp,
	/* output */
	.dump_step	= cache_exp_dump_step,
	.build_msg	= cache_exp_build_msg,
	.commit		= NULL,
};

/*
 * Operations for the cache of expectations coming from the network.
 * This cache can be committed but does not build network messages.
 */
struct cache_ops cache_sync_external_exp_ops = {
	/* object lifecycle */
	.alloc		= cache_exp_alloc,
	.free		= cache_exp_free,
	.copy		= cache_exp_copy,
	/* hashtable lookup */
	.hash		= cache_exp_hash,
	.cmp		= cache_exp_cmp,
	/* output */
	.dump_step	= cache_exp_dump_step,
	.commit		= cache_exp_commit,
	.build_msg	= NULL,
};