net/bird3: Add new branch 3.x (multithreaded)

Warning: Consider version 3.0.0 to be unstable.

PR:		283403
Sponsored by:	Netflix
This commit is contained in:
Olivier Cochard 2025-01-09 22:58:36 +01:00
parent b11d9c3b91
commit 4516e09a23
26 changed files with 2234 additions and 2 deletions

View file

@ -57,6 +57,7 @@
SUBDIR += bindtest SUBDIR += bindtest
SUBDIR += binkd SUBDIR += binkd
SUBDIR += bird2 SUBDIR += bird2
SUBDIR += bird3
SUBDIR += bittwist SUBDIR += bittwist
SUBDIR += bmon SUBDIR += bmon
SUBDIR += boinc-client SUBDIR += boinc-client

View file

@ -15,8 +15,7 @@ rtsock_PKGNAMESUFFIX= 2-rtsock
USES= bison cpe gmake ncurses readline USES= bison cpe gmake ncurses readline
CONFLICTS= bird CONFLICTS= bird3
CONFLICTS+= bird6
CPE_VENDOR= nic CPE_VENDOR= nic

62
net/bird3/Makefile Normal file
View file

@ -0,0 +1,62 @@
PORTNAME=	bird
DISTVERSION=	3.0.0
CATEGORIES=	net
MASTER_SITES=	https://bird.network.cz/download/
PKGNAMESUFFIX=	3

MAINTAINER=	olivier@FreeBSD.org
COMMENT=	Dynamic multithreaded IP routing daemon
WWW=		https://bird.network.cz/

LICENSE=	GPLv2

USES=		bison cpe gmake ncurses readline

CONFLICTS=	bird2

CPE_VENDOR=	nic

USE_CSTD=	gnu99
GNU_CONFIGURE=	yes
CONFIGURE_ARGS=	--localstatedir=/var
USE_RC_SUBR=	bird
SUB_FILES=	pkg-message

# Group handed to the daemon with -g by the rc.d script (files/bird.in).
GROUPS=		birdvty

MAKE_JOBS_UNSAFE=	yes

# At least one routing protocol has to be selected (OPTIONS_MULTI).
# BMP is offered but is not part of the default set.
OPTIONS_MULTI=	RP
RP_DESC=	Routing Protocols
OPTIONS_MULTI_RP=	BFD BABEL BMP BGP MRT OSPF PIPE RADV RIP RPKI STATIC
OPTIONS_DEFAULT=	BFD BABEL BGP MRT OSPF PIPE RADV RIP RPKI STATIC

BFD_DESC=	Bidirectional Forwarding Detection
BABEL_DESC=	Babel routing protocol
BGP_DESC=	Border Gateway Protocol
BMP_DESC=	BGP Monitoring Protocol
MRT_DESC=	Dumping Routing Information in MRT Format
OSPF_DESC=	Open Shortest Path First
PIPE_DESC=	PIPE routing
RADV_DESC=	Router Advertisement
RIP_DESC=	Routing Information Protocol
RPKI_DESC=	Resource Public Key Infrastructure
STATIC_DESC=	Static routing

# Every enabled option appends its protocol name to rt_prot.  The options
# framework upper-cases the variable name, so the accumulated list is read
# back as ${RT_PROT} when building the configure arguments below.
BFD_VARS=	rt_prot+=bfd
BABEL_VARS=	rt_prot+=babel
BGP_VARS=	rt_prot+=bgp
BMP_VARS=	rt_prot+=bmp
MRT_VARS=	rt_prot+=mrt
OSPF_VARS=	rt_prot+=ospf
PIPE_VARS=	rt_prot+=pipe
RADV_VARS=	rt_prot+=radv
RIP_VARS=	rt_prot+=rip
RPKI_VARS=	rt_prot+=rpki
STATIC_VARS=	rt_prot+=static

CONFIGURE_ARGS+=--with-protocols="${RT_PROT}"
CONFIGURE_ARGS+=--with-sysconfig=bsd-netlink

# libssh is only linked in when the RPKI protocol is enabled.
RPKI_LIB_DEPENDS=	libssh.so:security/libssh

.include <bsd.port.mk>

3
net/bird3/distinfo Normal file
View file

@ -0,0 +1,3 @@
TIMESTAMP = 1734554961
SHA256 (bird-3.0.0.tar.gz) = 8130440a2e273ba6456df2fb3acb43da7cb4d566f94a294a3a52a1b118f2512a
SIZE (bird-3.0.0.tar.gz) = 2641569

30
net/bird3/files/bird.in Normal file
View file

@ -0,0 +1,30 @@
#!/bin/sh

# PROVIDE: bird dynamicrouting
# REQUIRE: LOGIN
# KEYWORD: shutdown
#
# rc.conf variables understood by this script; set them in /etc/rc.conf
# or /etc/rc.conf.local:
#
# bird_enable (bool):	Defaults to "NO".
#			Set it to "YES" to enable bird.
# bird_config (path):	Configuration file passed to bird with -c.
#			Defaults to %%PREFIX%%/etc/bird.conf.
# bird_group (str):	Group passed to bird with -g.
#			Defaults to "birdvty".
#

. /etc/rc.subr

name="bird"
rcvar="${name}_enable"

load_rc_config "${name}"

# Apply the defaults to anything the rc.conf files left unset.
: ${bird_enable="NO"}
: ${bird_config="%%PREFIX%%/etc/bird.conf"}
: ${bird_group="birdvty"}

command="%%PREFIX%%/sbin/${name}"
command_args="-c ${bird_config} -g ${bird_group}"

run_rc_command "$1"

View file

@ -0,0 +1,38 @@
From b6caccfd45fb639b6dd3a8d140d3c5ba4cc79311 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Thu, 19 Dec 2024 11:00:15 +0100
Subject: [PATCH] Kernel: Fix crash for merge paths on if no route is in BIRD
There was a missing check for a NULL return value.
Also fixed an indenting error.
Thanks to Radu Anghel for reporting it:
https://bird.network.cz/pipermail/bird-users/2024-December/017977.html
---
nest/rt-table.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/nest/rt-table.c b/nest/rt-table.c
index fd8bb50dd..05191d743 100644
--- nest/rt-table.c
+++ nest/rt-table.c
@@ -5265,14 +5265,14 @@ krt_export_net(struct channel *c, const net_addr *a, linpool *lp)
if (c->ra_mode == RA_MERGED)
{
struct rt_export_feed *feed = rt_net_feed(c->table, a, NULL);
- if (!feed->count_routes)
+ if (!feed || !feed->count_routes)
return NULL;
if (!bmap_test(&c->export_accepted_map, feed->block[0].id))
return NULL;
return rt_export_merged(c, feed, lp, 1);
- }
+ }
static _Thread_local rte best;
best = rt_net_best(c->table, a);
--
GitLab

View file

@ -0,0 +1,39 @@
From 0a2f92ad205d96d0be0945ecf2bb740b68d5a3c1 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Thu, 19 Dec 2024 11:54:05 +0100
Subject: [PATCH] Table: not feeding twice, once is enough
If there is no feed pending, the requested one should be
activated immediately, otherwise it is activated only after
the full run, effectively running first a full feed and
then the requested one.
---
nest/rt-export.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/nest/rt-export.c b/nest/rt-export.c
index 7368447de..7d51e54cf 100644
--- nest/rt-export.c
+++ nest/rt-export.c
@@ -357,8 +357,16 @@ rt_export_refeed_feeder(struct rt_export_feeder *f, struct rt_feeding_request *r
if (!rfr)
return;
- rfr->next = f->feed_pending;
- f->feed_pending = rfr;
+ if (f->feeding)
+ {
+ rfr->next = f->feed_pending;
+ f->feed_pending = rfr;
+ }
+ else
+ {
+ rfr->next = NULL;
+ f->feeding = rfr;
+ }
}
void rt_export_refeed_request(struct rt_export_request *rer, struct rt_feeding_request *rfr)
--
GitLab

View file

@ -0,0 +1,53 @@
From ab74652f96c301dd2d2d2a831dd1a159ae1d5e02 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Thu, 19 Dec 2024 12:28:27 +0100
Subject: [PATCH] Kernel: when channel traces, we have to trace the final
result
Otherwise it looks like we are sending too much traffic to netlink
every other while, which is not true. Now we can disambiguate between
in-kernel updates and ignored routes.
---
sysdep/unix/krt.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c
index 2770b8be2..34882b88f 100644
--- sysdep/unix/krt.c
+++ sysdep/unix/krt.c
@@ -672,7 +672,7 @@ krt_preexport(struct channel *C, rte *e)
}
static void
-krt_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net,
+krt_rt_notify(struct proto *P, struct channel *ch, const net_addr *net,
rte *new, const rte *old)
{
struct krt_proto *p = (struct krt_proto *) P;
@@ -688,13 +688,21 @@ krt_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net,
case KPS_IDLE:
case KPS_PRUNING:
if (new && bmap_test(&p->seen_map, new->id))
- /* Already installed and seen in the kernel dump */
- return;
+ if (ch->debug & D_ROUTES)
+ {
+ /* Already installed and seen in the kernel dump */
+ log(L_TRACE "%s.%s: %N already in kernel",
+ P->name, ch->name, net);
+ return;
+ }
/* fall through */
case KPS_SCANNING:
/* Actually replace the route */
krt_replace_rte(p, net, new, old);
+ if (ch->debug & D_ROUTES)
+ log(L_TRACE "%s.%s: %N %s kernel",
+ P->name, ch->name, net, old ? "replaced in" : "added to");
break;
}
--
GitLab

View file

@ -0,0 +1,176 @@
From 6779e5da698feb9b9e02411859ad81885ba46c01 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Fri, 20 Dec 2024 11:28:00 +0100
Subject: [PATCH] BGP: fix locking order error on dynamic protocol spawn
We missed that the protocol spawner violates the prescribed
locking order. When the rtable level is locked, no new protocol can be
started, thus we need to:
* create the protocol from a clean mainloop context
* in protocol start hook, take the socket
Testsuite: cf-bgp-autopeer
Fixes: #136
Thanks to Job Snijders <job@fastly.com> for reporting:
https://trubka.network.cz/pipermail/bird-users/2024-December/017980.html
---
nest/proto.c | 19 +++++++++++++++++++
nest/protocol.h | 2 ++
proto/bgp/bgp.c | 46 +++++++++++++++++++++++++++++++++++-----------
3 files changed, 56 insertions(+), 11 deletions(-)
diff --git a/nest/proto.c b/nest/proto.c
index dded84f51..678697d69 100644
--- nest/proto.c
+++ nest/proto.c
@@ -1867,6 +1867,25 @@ proto_spawn(struct proto_config *cf, uint disabled)
return p;
}
+bool
+proto_disable(struct proto *p)
+{
+ ASSERT_DIE(birdloop_inside(&main_birdloop));
+ bool changed = !p->disabled;
+ p->disabled = 1;
+ proto_rethink_goal(p);
+ return changed;
+}
+
+bool
+proto_enable(struct proto *p)
+{
+ ASSERT_DIE(birdloop_inside(&main_birdloop));
+ bool changed = p->disabled;
+ p->disabled = 0;
+ proto_rethink_goal(p);
+ return changed;
+}
/**
* DOC: Graceful restart recovery
diff --git a/nest/protocol.h b/nest/protocol.h
index 25ed6f553..cf7ecb898 100644
--- nest/protocol.h
+++ nest/protocol.h
@@ -78,6 +78,8 @@ void proto_build(struct protocol *); /* Called from protocol to register itself
void protos_preconfig(struct config *);
void protos_commit(struct config *new, struct config *old, int type);
struct proto * proto_spawn(struct proto_config *cf, uint disabled);
+bool proto_disable(struct proto *p);
+bool proto_enable(struct proto *p);
void protos_dump_all(struct dump_request *);
#define GA_UNKNOWN 0 /* Attribute not recognized */
diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c
index 5fc2b5fff..3170e3a42 100644
--- proto/bgp/bgp.c
+++ proto/bgp/bgp.c
@@ -378,8 +378,6 @@ bgp_startup(struct bgp_proto *p)
if (p->postponed_sk)
{
/* Apply postponed incoming connection */
- sk_reloop(p->postponed_sk, p->p.loop);
-
bgp_setup_conn(p, &p->incoming_conn);
bgp_setup_sk(&p->incoming_conn, p->postponed_sk);
bgp_send_open(&p->incoming_conn);
@@ -583,6 +581,9 @@ bgp_graceful_close_conn(struct bgp_conn *conn, int subcode, byte *data, uint len
static void
bgp_down(struct bgp_proto *p)
{
+ /* Check that the dynamic BGP socket has been picked up */
+ ASSERT_DIE(p->postponed_sk == NULL);
+
if (bgp_start_state(p) > BSS_PREPARE)
{
bgp_setup_auth(p, 0);
@@ -617,8 +618,8 @@ bgp_decision(void *vp)
bgp_down(p);
}
-static struct bgp_proto *
-bgp_spawn(struct bgp_proto *pp, ip_addr remote_ip)
+static void
+bgp_spawn(struct bgp_proto *pp, struct birdsock *sk)
{
struct symbol *sym;
char fmt[SYM_MAX_LEN];
@@ -635,9 +636,16 @@ bgp_spawn(struct bgp_proto *pp, ip_addr remote_ip)
cfg_mem = NULL;
/* Just pass remote_ip to bgp_init() */
- ((struct bgp_config *) sym->proto)->remote_ip = remote_ip;
+ ((struct bgp_config *) sym->proto)->remote_ip = sk->daddr;
+
+ /* Create the protocol disabled initially */
+ SKIP_BACK_DECLARE(struct bgp_proto, p, p, proto_spawn(sym->proto, 1));
- return (void *) proto_spawn(sym->proto, 0);
+ /* Pass the socket */
+ p->postponed_sk = sk;
+
+ /* And enable the protocol */
+ proto_enable(&p->p);
}
void
@@ -1489,10 +1497,15 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED)
/* For dynamic BGP, spawn new instance and postpone the socket */
if (bgp_is_dynamic(p))
{
- p = bgp_spawn(p, sk->daddr);
- p->postponed_sk = sk;
- rmove(sk, p->p.pool);
- goto leave;
+ UNLOCK_DOMAIN(rtable, bgp_listen_domain);
+
+ /* The dynamic protocol must be in the START state */
+ ASSERT_DIE(p->p.proto_state == PS_START);
+ birdloop_leave(p->p.loop);
+
+ /* Now we have a clean mainloop */
+ bgp_spawn(p, sk);
+ return 0;
}
rmove(sk, p->p.pool);
@@ -1806,7 +1819,6 @@ bgp_start(struct proto *P)
p->incoming_conn.state = BS_IDLE;
p->neigh = NULL;
p->bfd_req = NULL;
- p->postponed_sk = NULL;
p->gr_ready = 0;
p->gr_active_num = 0;
@@ -1848,6 +1860,16 @@ bgp_start(struct proto *P)
channel_graceful_restart_lock(&c->c);
}
+ /* Now it's the last chance to move the postponed socket to this BGP,
+ * as bgp_start is the only hook running from main loop. */
+ if (p->postponed_sk)
+ {
+ LOCK_DOMAIN(rtable, bgp_listen_domain);
+ rmove(p->postponed_sk, p->p.pool);
+ sk_reloop(p->postponed_sk, p->p.loop);
+ UNLOCK_DOMAIN(rtable, bgp_listen_domain);
+ }
+
/*
* Before attempting to create the connection, we need to lock the port,
* so that we are the only instance attempting to talk with that neighbor.
@@ -1999,6 +2021,8 @@ bgp_init(struct proto_config *CF)
p->remote_ip = cf->remote_ip;
p->remote_as = cf->remote_as;
+ p->postponed_sk = NULL;
+
/* Hack: We use cf->remote_ip just to pass remote_ip from bgp_spawn() */
if (cf->c.parent)
cf->remote_ip = IPA_NONE;
--
GitLab

View file

@ -0,0 +1,400 @@
From 83495362789d961914c4bfaa590e31cb17370ed0 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Sat, 21 Dec 2024 19:02:22 +0100
Subject: [PATCH] BFD: Fix session reconfiguration locking order
The sessions have to be updated asynchronously to avoid
cross-locking between protocols.
Testsuite: cf-ibgp-bfd-switch, cf-ibgp-multi-bfd-auth
Fixes: #139
Thanks to Daniel Suchy <danny@danysek.cz> for reporting:
https://trubka.network.cz/pipermail/bird-users/2024-December/017984.html
---
nest/bfd.h | 7 ++-
proto/bfd/bfd.c | 144 +++++++++++++++++++++++---------------------
proto/bfd/bfd.h | 21 +------
proto/bfd/config.Y | 42 +++++--------
proto/bfd/packets.c | 4 +-
5 files changed, 98 insertions(+), 120 deletions(-)
diff --git a/nest/bfd.h b/nest/bfd.h
index 5dacff5d7..c046152f8 100644
--- nest/bfd.h
+++ nest/bfd.h
@@ -18,8 +18,11 @@ struct bfd_options {
u32 min_tx_int;
u32 idle_tx_int;
u8 multiplier;
- u8 passive;
- u8 passive_set;
+ PACKED enum bfd_opt_passive {
+ BFD_OPT_PASSIVE_UNKNOWN = 0,
+ BFD_OPT_PASSIVE,
+ BFD_OPT_NOT_PASSIVE,
+ } passive;
u8 mode;
u8 auth_type; /* Authentication type (BFD_AUTH_*) */
list *passwords; /* Passwords for authentication */
diff --git a/proto/bfd/bfd.c b/proto/bfd/bfd.c
index 34f992b93..4997f803a 100644
--- proto/bfd/bfd.c
+++ proto/bfd/bfd.c
@@ -172,17 +172,17 @@ static void bfd_free_iface(struct bfd_iface *ifa);
* BFD sessions
*/
-static inline struct bfd_session_config
-bfd_merge_options(const struct bfd_iface_config *cf, const struct bfd_options *opts)
+static inline struct bfd_options
+bfd_merge_options(const struct bfd_options *bottom, const struct bfd_options *top)
{
- return (struct bfd_session_config) {
- .min_rx_int = opts->min_rx_int ?: cf->min_rx_int,
- .min_tx_int = opts->min_tx_int ?: cf->min_tx_int,
- .idle_tx_int = opts->idle_tx_int ?: cf->idle_tx_int,
- .multiplier = opts->multiplier ?: cf->multiplier,
- .passive = opts->passive_set ? opts->passive : cf->passive,
- .auth_type = opts->auth_type ?: cf->auth_type,
- .passwords = opts->passwords ?: cf->passwords,
+ return (struct bfd_options) {
+ .min_rx_int = top->min_rx_int ?: bottom->min_rx_int,
+ .min_tx_int = top->min_tx_int ?: bottom->min_tx_int,
+ .idle_tx_int = top->idle_tx_int ?: bottom->idle_tx_int,
+ .multiplier = top->multiplier ?: bottom->multiplier,
+ .passive = top->passive ?: bottom->passive,
+ .auth_type = top->auth_type ?: bottom->auth_type,
+ .passwords = top->passwords ?: bottom->passwords,
};
}
@@ -478,7 +478,7 @@ bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface *
HASH_INSERT(p->session_hash_id, HASH_ID, s);
HASH_INSERT(p->session_hash_ip, HASH_IP, s);
- s->cf = bfd_merge_options(ifa->cf, opts);
+ s->cf = bfd_merge_options(&ifa->cf->opts, opts);
/* Initialization of state variables - see RFC 5880 6.8.1 */
s->loc_state = BFD_STATE_DOWN;
@@ -561,26 +561,58 @@ bfd_remove_session(struct bfd_proto *p, struct bfd_session *s)
birdloop_leave(p->p.loop);
}
+struct bfd_reconfigure_sessions_deferred_call {
+ struct deferred_call dc;
+ struct bfd_proto *p;
+ config_ref old_config;
+};
+
static void
-bfd_reconfigure_session(struct bfd_proto *p, struct bfd_session *s)
+bfd_reconfigure_sessions(struct deferred_call *dc)
{
- if (EMPTY_LIST(s->request_list))
- return;
+ SKIP_BACK_DECLARE(struct bfd_reconfigure_sessions_deferred_call,
+ brsdc, dc, dc);
- ASSERT_DIE(birdloop_inside(p->p.loop));
+ struct bfd_proto *p = brsdc->p;
+ birdloop_enter(p->p.loop);
- SKIP_BACK_DECLARE(struct bfd_request, req, n, HEAD(s->request_list));
- s->cf = bfd_merge_options(s->ifa->cf, &req->opts);
+ HASH_WALK(p->session_hash_id, next_id, s)
+ {
+ if (!EMPTY_LIST(s->request_list))
+ {
+ SKIP_BACK_DECLARE(struct bfd_request, req, n, HEAD(s->request_list));
+ struct bfd_options opts = bfd_merge_options(&s->ifa->cf->opts, &req->opts);
- u32 tx = (s->loc_state == BFD_STATE_UP) ? s->cf.min_tx_int : s->cf.idle_tx_int;
- bfd_session_set_min_tx(s, tx);
- bfd_session_set_min_rx(s, s->cf.min_rx_int);
- s->detect_mult = s->cf.multiplier;
- s->passive = s->cf.passive;
+#define CHK(x) (opts.x != s->cf.x) ||
+ bool reload = MACRO_FOREACH(CHK,
+ min_rx_int,
+ min_tx_int,
+ idle_tx_int,
+ multiplier,
+ passive) false; /* terminating the || chain */
+#undef CHK
- bfd_session_control_tx_timer(s, 0);
+ s->cf = opts;
+
+ if (reload)
+ {
+ u32 tx = (s->loc_state == BFD_STATE_UP) ? s->cf.min_tx_int : s->cf.idle_tx_int;
+ bfd_session_set_min_tx(s, tx);
+ bfd_session_set_min_rx(s, s->cf.min_rx_int);
+ s->detect_mult = s->cf.multiplier;
+ s->passive = s->cf.passive;
+
+ bfd_session_control_tx_timer(s, 0);
+
+ TRACE(D_EVENTS, "Session to %I reconfigured", s->addr);
+ }
+ }
+ }
+ HASH_WALK_END;
+ birdloop_leave(p->p.loop);
- TRACE(D_EVENTS, "Session to %I reconfigured", s->addr);
+ /* Now the config is clean */
+ OBSREF_CLEAR(brsdc->old_config);
}
@@ -589,10 +621,12 @@ bfd_reconfigure_session(struct bfd_proto *p, struct bfd_session *s)
*/
static struct bfd_iface_config bfd_default_iface = {
- .min_rx_int = BFD_DEFAULT_MIN_RX_INT,
- .min_tx_int = BFD_DEFAULT_MIN_TX_INT,
- .idle_tx_int = BFD_DEFAULT_IDLE_TX_INT,
- .multiplier = BFD_DEFAULT_MULTIPLIER,
+ .opts = {
+ .min_rx_int = BFD_DEFAULT_MIN_RX_INT,
+ .min_tx_int = BFD_DEFAULT_MIN_TX_INT,
+ .idle_tx_int = BFD_DEFAULT_IDLE_TX_INT,
+ .multiplier = BFD_DEFAULT_MULTIPLIER,
+ },
};
static inline struct bfd_iface_config *
@@ -650,24 +684,6 @@ bfd_free_iface(struct bfd_iface *ifa)
mb_free(ifa);
}
-static void
-bfd_reconfigure_iface(struct bfd_proto *p UNUSED, struct bfd_iface *ifa, struct bfd_config *nc)
-{
- struct bfd_iface_config *new = bfd_find_iface_config(nc, ifa->iface);
- struct bfd_iface_config *old = ifa->cf;
-
- /* Check options that are handled in bfd_reconfigure_session() */
- ifa->changed =
- (new->min_rx_int != old->min_rx_int) ||
- (new->min_tx_int != old->min_tx_int) ||
- (new->idle_tx_int != old->idle_tx_int) ||
- (new->multiplier != old->multiplier) ||
- (new->passive != old->passive);
-
- /* This should be probably changed to not access ifa->cf from the BFD thread */
- ifa->cf = new;
-}
-
/*
* BFD requests
@@ -900,20 +916,7 @@ bfd_request_session(pool *p, ip_addr addr, ip_addr local,
void
bfd_update_request(struct bfd_request *req, const struct bfd_options *opts)
{
- struct bfd_session *s = req->session;
-
- if (!memcmp(opts, &req->opts, sizeof(const struct bfd_options)))
- return;
-
req->opts = *opts;
-
- if (s)
- {
- struct bfd_proto *p = s->ifa->bfd;
- birdloop_enter(p->p.loop);
- bfd_reconfigure_session(p, s);
- birdloop_leave(p->p.loop);
- }
}
static void
@@ -1193,21 +1196,22 @@ bfd_reconfigure(struct proto *P, struct proto_config *c)
(new->zero_udp6_checksum_rx != old->zero_udp6_checksum_rx))
return 0;
- birdloop_mask_wakeups(p->p.loop);
-
WALK_LIST(ifa, p->iface_list)
- bfd_reconfigure_iface(p, ifa, new);
-
- HASH_WALK(p->session_hash_id, next_id, s)
- {
- if (s->ifa->changed)
- bfd_reconfigure_session(p, s);
- }
- HASH_WALK_END;
+ ifa->cf = bfd_find_iface_config(new, ifa->iface);
bfd_reconfigure_neighbors(p, new);
- birdloop_unmask_wakeups(p->p.loop);
+ /* Sessions get reconfigured after all the config is applied */
+ struct bfd_reconfigure_sessions_deferred_call brsdc = {
+ .dc.hook = bfd_reconfigure_sessions,
+ .p = p,
+ };
+ SKIP_BACK_DECLARE(struct bfd_reconfigure_sessions_deferred_call,
+ brsdcp, dc, defer_call(&brsdc.dc, sizeof brsdc));
+
+ /* We need to keep the old config alive until all the sessions get
+ * reconfigured */
+ OBSREF_SET(brsdcp->old_config, P->cf->global);
return 1;
}
diff --git a/proto/bfd/bfd.h b/proto/bfd/bfd.h
index 578ce8755..107829b72 100644
--- proto/bfd/bfd.h
+++ proto/bfd/bfd.h
@@ -54,24 +54,7 @@ struct bfd_config
struct bfd_iface_config
{
struct iface_patt i;
- u32 min_rx_int;
- u32 min_tx_int;
- u32 idle_tx_int;
- u8 multiplier;
- u8 passive;
- u8 auth_type; /* Authentication type (BFD_AUTH_*) */
- list *passwords; /* Passwords for authentication */
-};
-
-struct bfd_session_config
-{
- u32 min_rx_int;
- u32 min_tx_int;
- u32 idle_tx_int;
- u8 multiplier;
- u8 passive;
- u8 auth_type; /* Authentication type (BFD_AUTH_*) */
- list *passwords; /* Passwords for authentication */
+ struct bfd_options opts;
};
struct bfd_neighbor
@@ -146,7 +129,7 @@ struct bfd_session
u32 loc_id; /* Local session ID (local discriminator) */
u32 rem_id; /* Remote session ID (remote discriminator) */
- struct bfd_session_config cf; /* Static configuration parameters */
+ struct bfd_options cf; /* Static configuration parameters */
u32 des_min_tx_int; /* Desired min rx interval, local option */
u32 des_min_tx_new; /* Used for des_min_tx_int change */
diff --git a/proto/bfd/config.Y b/proto/bfd/config.Y
index 9e9919c4e..56d1ffac4 100644
--- proto/bfd/config.Y
+++ proto/bfd/config.Y
@@ -86,44 +86,37 @@ bfd_iface_start:
add_tail(&BFD_CFG->patt_list, NODE this_ipatt);
init_list(&this_ipatt->ipn_list);
- BFD_IFACE->min_rx_int = BFD_DEFAULT_MIN_RX_INT;
- BFD_IFACE->min_tx_int = BFD_DEFAULT_MIN_TX_INT;
- BFD_IFACE->idle_tx_int = BFD_DEFAULT_IDLE_TX_INT;
- BFD_IFACE->multiplier = BFD_DEFAULT_MULTIPLIER;
+ this_bfd_opts = &BFD_IFACE->opts;
+
+ this_bfd_opts->min_rx_int = BFD_DEFAULT_MIN_RX_INT;
+ this_bfd_opts->min_tx_int = BFD_DEFAULT_MIN_TX_INT;
+ this_bfd_opts->idle_tx_int = BFD_DEFAULT_IDLE_TX_INT;
+ this_bfd_opts->multiplier = BFD_DEFAULT_MULTIPLIER;
reset_passwords();
};
bfd_iface_finish:
{
- BFD_IFACE->passwords = get_passwords();
+ this_bfd_opts->passwords = get_passwords();
- if (!BFD_IFACE->auth_type != !BFD_IFACE->passwords)
+ if (!this_bfd_opts->auth_type != !this_bfd_opts->passwords)
cf_warn("Authentication and password options should be used together");
- if (BFD_IFACE->passwords)
+ if (this_bfd_opts->passwords)
{
struct password_item *pass;
- WALK_LIST(pass, *BFD_IFACE->passwords)
+ WALK_LIST(pass, *this_bfd_opts->passwords)
{
if (pass->alg)
cf_error("Password algorithm option not available in BFD protocol");
- pass->alg = bfd_auth_type_to_hash_alg[BFD_IFACE->auth_type];
+ pass->alg = bfd_auth_type_to_hash_alg[this_bfd_opts->auth_type];
}
}
-};
-bfd_iface_item:
- INTERVAL expr_us { BFD_IFACE->min_rx_int = BFD_IFACE->min_tx_int = $2; }
- | MIN RX INTERVAL expr_us { BFD_IFACE->min_rx_int = $4; }
- | MIN TX INTERVAL expr_us { BFD_IFACE->min_tx_int = $4; }
- | IDLE TX INTERVAL expr_us { BFD_IFACE->idle_tx_int = $4; }
- | MULTIPLIER expr { BFD_IFACE->multiplier = $2; }
- | PASSIVE bool { BFD_IFACE->passive = $2; }
- | AUTHENTICATION bfd_auth_type { BFD_IFACE->auth_type = $2; }
- | password_list {}
- ;
+ this_bfd_opts = NULL;
+};
bfd_auth_type:
NONE { $$ = BFD_AUTH_NONE; }
@@ -134,14 +127,9 @@ bfd_auth_type:
| METICULOUS KEYED SHA1 { $$ = BFD_AUTH_METICULOUS_KEYED_SHA1; }
;
-bfd_iface_opts:
- /* empty */
- | bfd_iface_opts bfd_iface_item ';'
- ;
-
bfd_iface_opt_list:
/* empty */
- | '{' bfd_iface_opts '}'
+ | '{' bfd_items '}'
;
bfd_iface:
@@ -194,7 +182,7 @@ bfd_item:
| MIN TX INTERVAL expr_us { this_bfd_opts->min_tx_int = $4; }
| IDLE TX INTERVAL expr_us { this_bfd_opts->idle_tx_int = $4; }
| MULTIPLIER expr { this_bfd_opts->multiplier = $2; }
- | PASSIVE bool { this_bfd_opts->passive = $2; this_bfd_opts->passive_set = 1; }
+ | PASSIVE bool { this_bfd_opts->passive = $2 ? BFD_OPT_PASSIVE : BFD_OPT_NOT_PASSIVE; }
| GRACEFUL { this_bfd_opts->mode = BGP_BFD_GRACEFUL; }
| AUTHENTICATION bfd_auth_type { this_bfd_opts->auth_type = $2; }
| password_list {}
diff --git a/proto/bfd/packets.c b/proto/bfd/packets.c
index 1ceb470c1..f8bd63d73 100644
--- proto/bfd/packets.c
+++ proto/bfd/packets.c
@@ -109,7 +109,7 @@ const u8 bfd_auth_type_to_hash_alg[] = {
static void
bfd_fill_authentication(struct bfd_proto *p, struct bfd_session *s, struct bfd_ctl_packet *pkt)
{
- struct bfd_session_config *cf = &s->cf;
+ struct bfd_options *cf = &s->cf;
struct password_item *pass = password_find(cf->passwords, 0);
uint meticulous = 0;
@@ -179,7 +179,7 @@ bfd_fill_authentication(struct bfd_proto *p, struct bfd_session *s, struct bfd_c
static int
bfd_check_authentication(struct bfd_proto *p, struct bfd_session *s, struct bfd_ctl_packet *pkt)
{
- struct bfd_session_config *cf = &s->cf;
+ struct bfd_options *cf = &s->cf;
const char *err_dsc = NULL;
uint err_val = 0;
uint auth_type = 0;
--
GitLab

View file

@ -0,0 +1,86 @@
From 3d1f19e335f55c8cfa3cb7ca9d7b88ca03173d8e Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Sun, 22 Dec 2024 21:32:28 +0100
Subject: [PATCH] Mainloop: Dropped old socket prioritization magic
This is now done in worker threads and the mainloop needs to do other things,
most notably kernel and CLI, with less overhead of repeatedly checking poll.
---
sysdep/unix/io-loop.c | 2 +-
sysdep/unix/io.c | 21 +++++++--------------
2 files changed, 8 insertions(+), 15 deletions(-)
diff --git a/sysdep/unix/io-loop.c b/sysdep/unix/io-loop.c
index f69189e06..a72c69a03 100644
--- sysdep/unix/io-loop.c
+++ sysdep/unix/io-loop.c
@@ -1403,7 +1403,7 @@ bool task_still_in_limit(void)
{
static u64 main_counter = 0;
if (this_birdloop == &main_birdloop)
- return (++main_counter % 2048); /* This is a hack because of no accounting in mainloop */
+ return (++main_counter % 512); /* This is a hack because of no accounting in mainloop */
else
return ns_now() < account_last + this_thread->max_loop_time_ns;
}
diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c
index f9785c074..51395e1e9 100644
--- sysdep/unix/io.c
+++ sysdep/unix/io.c
@@ -53,14 +53,15 @@
/* Maximum number of calls of tx handler for one socket in one
* poll iteration. Should be small enough to not monopolize CPU by
- * one protocol instance.
+ * one protocol instance. But as most of the problems are now offloaded
+ * to worker threads, too low values may actually bring problems with
+ * latency.
*/
-#define MAX_STEPS 4
+#define MAX_STEPS 2048
/* Maximum number of calls of rx handler for all sockets in one poll
- iteration. RX callbacks are often much more costly so we limit
- this to gen small latencies */
-#define MAX_RX_STEPS 4
+ iteration. RX callbacks are often a little bit more costly. */
+#define MAX_RX_STEPS 512
/*
@@ -2581,8 +2582,6 @@ io_init(void)
srandom((uint) (now ^ (now >> 32)));
}
-static int short_loops = 0;
-#define SHORT_LOOP_MAX 10
#define WORK_EVENTS_MAX 10
sock *stored_sock;
@@ -2670,10 +2669,9 @@ io_loop(void)
{
if (pfd.pfd.data[0].revents & POLLIN)
{
- /* IO loop reload requested */
+ /* Somebody sent an event to mainloop */
pipe_drain(&main_birdloop.thread->wakeup);
atomic_fetch_and_explicit(&main_birdloop.thread_transition, ~LTT_PING, memory_order_acq_rel);
- continue;
}
times_update();
@@ -2719,11 +2717,6 @@ io_loop(void)
main_birdloop.sock_active = sk_next(s);
}
- short_loops++;
- if (events && (short_loops < SHORT_LOOP_MAX))
- continue;
- short_loops = 0;
-
int count = 0;
main_birdloop.sock_active = stored_sock;
if (main_birdloop.sock_active == NULL)
--
GitLab

View file

@ -0,0 +1,134 @@
From de9dbee796876f5b621e40e0082612aad746cac1 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Sun, 22 Dec 2024 22:10:38 +0100
Subject: [PATCH] CLI: allocate TX buffers as pages, not by malloc
Every malloc risks heap bloating and these blocks are already
the same size as pages.
---
nest/cli.c | 59 ++++++++++++++++++++++++++++++++++++++++++------------
nest/cli.h | 2 +-
2 files changed, 47 insertions(+), 14 deletions(-)
diff --git a/nest/cli.c b/nest/cli.c
index 3b8e6f468..b33ffd437 100644
--- nest/cli.c
+++ nest/cli.c
@@ -81,13 +81,14 @@ cli_alloc_out(cli *c, int size)
o = c->tx_buf;
else
{
- o = mb_alloc(c->pool, sizeof(struct cli_out) + CLI_TX_BUF_SIZE);
+ o = alloc_page();
+ c->tx_pending_count++;
if (c->tx_write)
c->tx_write->next = o;
else
c->tx_buf = o;
o->wpos = o->outpos = o->buf;
- o->end = o->buf + CLI_TX_BUF_SIZE;
+ o->end = (void *) o + page_size;
}
c->tx_write = o;
if (!c->tx_pos)
@@ -167,19 +168,18 @@ cli_hello(cli *c)
static void
cli_free_out(cli *c)
{
- struct cli_out *o, *p;
+ for (struct cli_out *o = c->tx_buf, *n; o; o = n)
+ {
+ n = o->next;
+ free_page(o);
+ c->tx_pending_count--;
+ }
- if (o = c->tx_buf)
- {
- o->wpos = o->outpos = o->buf;
- while (p = o->next)
- {
- o->next = p->next;
- mb_free(p);
- }
- }
+ c->tx_buf = NULL;
c->tx_write = c->tx_pos = NULL;
c->async_msg_size = 0;
+
+ ASSERT_DIE(c->tx_pending_count == 0);
}
void
@@ -189,6 +189,38 @@ cli_written(cli *c)
ev_schedule(c->event);
}
+/* A dummy resource to show and free memory pages allocated for pending TX */
+struct cli_tx_resource {
+ resource r;
+ struct cli *c;
+};
+
+static void
+cli_tx_resource_free(resource *r)
+{
+ cli_free_out(SKIP_BACK(struct cli_tx_resource, r, r)->c);
+}
+
+static void
+cli_tx_resource_dump(struct dump_request *dreq UNUSED, resource *r UNUSED) {}
+
+static struct resmem
+cli_tx_resource_memsize(resource *r)
+{
+ return (struct resmem) {
+ .effective = SKIP_BACK(struct cli_tx_resource, r, r)->c->tx_pending_count * page_size,
+ .overhead = sizeof(struct cli_tx_resource),
+ };
+}
+
+static struct resclass cli_tx_resource_class = {
+ .name = "CLI TX buffers",
+ .size = sizeof (struct cli_tx_resource),
+ .free = cli_tx_resource_free,
+ .dump = cli_tx_resource_dump,
+ .memsize = cli_tx_resource_memsize,
+};
+
static byte *cli_rh_pos;
static uint cli_rh_len;
@@ -272,7 +304,8 @@ cli *
cli_new(struct birdsock *sock, struct cli_config *cf)
{
pool *p = rp_new(cli_pool, the_bird_domain.the_bird, "CLI");
- cli *c = mb_alloc(p, sizeof(cli));
+ struct cli_tx_resource *ctr = ralloc(p, &cli_tx_resource_class);
+ cli *c = ctr->c = mb_alloc(p, sizeof(cli));
bzero(c, sizeof(cli));
c->pool = p;
diff --git a/nest/cli.h b/nest/cli.h
index d86ec3801..671be04d8 100644
--- nest/cli.h
+++ nest/cli.h
@@ -17,7 +17,6 @@
#include "conf/conf.h"
#define CLI_RX_BUF_SIZE 4096
-#define CLI_TX_BUF_SIZE 4096
#define CLI_MAX_ASYNC_QUEUE 4096
#define CLI_MSG_SIZE 500
@@ -49,6 +48,7 @@ typedef struct cli {
uint log_mask; /* Mask of allowed message levels */
uint log_threshold; /* When free < log_threshold, store only important messages */
uint async_msg_size; /* Total size of async messages queued in tx_buf */
+ uint tx_pending_count; /* How many blocks are pending */
} cli;
struct cli_config {
--
GitLab

View file

@ -0,0 +1,29 @@
From 5fd0fd77e293328f354e7f6ed22632ba6ff96593 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Sun, 22 Dec 2024 22:26:44 +0100
Subject: [PATCH] CLI: Flushing tmp_linpool after every shown net.
There is no reason to keep the allocated objects through multiple nets.
---
nest/rt-show.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/nest/rt-show.c b/nest/rt-show.c
index 3986da83d..aa9209ca5 100644
--- nest/rt-show.c
+++ nest/rt-show.c
@@ -282,8 +282,9 @@ rt_show_cont(struct cli *c)
rt_show_table(d);
RT_FEED_WALK(&d->tab->req, f)
- if (f->count_routes)
- rt_show_net(d, f);
+ TMP_SAVED
+ if (f->count_routes)
+ rt_show_net(d, f);
if (rt_export_feed_active(&d->tab->req))
rt_feeder_unsubscribe(&d->tab->req);
--
GitLab

View file

@ -0,0 +1,274 @@
From 0fa80d7c79428e5370740a2eba5605b65131ebd6 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Mon, 23 Dec 2024 11:58:05 +0100
Subject: [PATCH] Kernel: feed only once during startup
There was an inefficiency in the initial scan state machine,
causing routes to be fed several times instead of just once.
Now the export startup is postponed until first krt_scan()
finishes and we actually can do the pruning with full information.
---
nest/proto.c | 4 ++-
nest/protocol.h | 2 ++
sysdep/unix/krt.c | 69 ++++++++++++++++++++++++++++-------------------
sysdep/unix/krt.h | 5 ++--
4 files changed, 48 insertions(+), 32 deletions(-)
diff --git a/nest/proto.c b/nest/proto.c
index 678697d69..6fa74e9f1 100644
--- nest/proto.c
+++ nest/proto.c
@@ -676,9 +676,11 @@ void channel_notify_basic(void *);
void channel_notify_accepted(void *);
void channel_notify_merged(void *);
-static void
+void
channel_start_export(struct channel *c)
{
+ ASSERT_DIE(birdloop_inside(c->proto->loop));
+
if (rt_export_get_state(&c->out_req) != TES_DOWN)
bug("%s.%s: Attempted to start channel's already started export", c->proto->name, c->name);
diff --git a/nest/protocol.h b/nest/protocol.h
index cf7ecb898..2bfa1628a 100644
--- nest/protocol.h
+++ nest/protocol.h
@@ -747,6 +747,8 @@ int proto_configure_channel(struct proto *p, struct channel **c, struct channel_
void channel_set_state(struct channel *c, uint state);
+void channel_start_export(struct channel *c);
+
void channel_add_obstacle(struct channel *c);
void channel_del_obstacle(struct channel *c);
diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c
index 34882b88f..1658dd6fe 100644
--- sysdep/unix/krt.c
+++ sysdep/unix/krt.c
@@ -342,6 +342,8 @@ krt_learn_async(struct krt_proto *p, rte *e, int new)
/* Hook defined in nest/rt-table.c ... to be refactored away later */
rte *krt_export_net(struct channel *c, const net_addr *a, linpool *lp);
+static void krt_rt_notify(struct proto *P, struct channel *ch, const net_addr *net, rte *new, const rte *old);
+
static int
krt_same_dest(rte *k, rte *e)
{
@@ -361,6 +363,11 @@ krt_same_dest(rte *k, rte *e)
void
krt_got_route(struct krt_proto *p, rte *e, s8 src)
{
+ /* If we happen to get an asynchronous route notification
+ * before initialization, we wait for the scan. */
+ if (p->sync_state == KPS_INIT)
+ return;
+
rte *new = NULL;
e->pflags = 0;
@@ -391,10 +398,6 @@ krt_got_route(struct krt_proto *p, rte *e, s8 src)
/* The rest is for KRT_SRC_BIRD (or KRT_SRC_UNKNOWN) */
- /* We wait for the initial feed to have correct installed state */
- if (!p->ready)
- goto ignore;
-
/* Get the exported version */
new = krt_export_net(p->p.main_channel, e->net, krt_filter_lp);
@@ -423,10 +426,6 @@ aseen:
krt_trace_in(p, e, "already seen");
goto done;
-ignore:
- krt_trace_in(p, e, "ignored");
- goto done;
-
update:
krt_trace_in(p, new, "updating");
krt_replace_rte(p, e->net, new, e);
@@ -447,12 +446,21 @@ krt_init_scan(struct krt_proto *p)
{
switch (p->sync_state)
{
+ case KPS_INIT:
+ /* Allow exports now */
+ p->p.rt_notify = krt_rt_notify;
+ channel_start_export(p->p.main_channel);
+ rt_refresh_begin(&p->p.main_channel->in_req);
+ p->sync_state = KPS_FIRST_SCAN;
+ return 1;
+
case KPS_IDLE:
rt_refresh_begin(&p->p.main_channel->in_req);
bmap_reset(&p->seen_map, 1024);
p->sync_state = KPS_SCANNING;
return 1;
+ case KPS_FIRST_SCAN:
case KPS_SCANNING:
bug("Kernel scan double-init");
@@ -470,14 +478,17 @@ krt_prune(struct krt_proto *p)
{
switch (p->sync_state)
{
+ case KPS_INIT:
case KPS_IDLE:
bug("Kernel scan prune without scan");
case KPS_SCANNING:
+ channel_request_full_refeed(p->p.main_channel);
+ /* fall through */
+ case KPS_FIRST_SCAN:
p->sync_state = KPS_PRUNING;
KRT_TRACE(p, D_EVENTS, "Pruning table %s", p->p.main_channel->table->name);
rt_refresh_end(&p->p.main_channel->in_req);
- channel_request_full_refeed(p->p.main_channel);
break;
case KPS_PRUNING:
@@ -549,7 +560,7 @@ krt_scan_all(timer *t UNUSED)
krt_do_scan(NULL);
WALK_LIST2(p, n, krt_proto_list, krt_node)
- if (p->sync_state == KPS_SCANNING)
+ if ((p->sync_state == KPS_SCANNING) || (p->sync_state == KPS_FIRST_SCAN))
krt_prune(p);
}
@@ -644,6 +655,9 @@ krt_scan_timer_kick(struct krt_proto *p)
static int
krt_preexport(struct channel *C, rte *e)
{
+ /* The export should not start before proper sync */
+ ASSERT_DIE(SKIP_BACK(struct krt_proto, p, C->proto)->sync_state != KPS_INIT);
+
if (e->src->owner == &C->proto->sources)
#ifdef CONFIG_SINGLE_ROUTE
return 1;
@@ -659,15 +673,6 @@ krt_preexport(struct channel *C, rte *e)
return -1;
}
- /* Before first scan we don't touch the routes */
- if (!SKIP_BACK(struct krt_proto, p, C->proto)->ready)
- {
- if (C->debug & D_ROUTES)
- log(L_TRACE "%s.%s not ready yet to accept route for %N",
- C->proto->name, C->name, e->net);
- return -1;
- }
-
return 0;
}
@@ -685,18 +690,24 @@ krt_rt_notify(struct proto *P, struct channel *ch, const net_addr *net,
switch (p->sync_state)
{
+ case KPS_INIT:
+ bug("Routes in init state should have been rejected by preexport.");
+
case KPS_IDLE:
case KPS_PRUNING:
if (new && bmap_test(&p->seen_map, new->id))
+ {
if (ch->debug & D_ROUTES)
{
/* Already installed and seen in the kernel dump */
log(L_TRACE "%s.%s: %N already in kernel",
P->name, ch->name, net);
- return;
}
+ return;
+ }
/* fall through */
+ case KPS_FIRST_SCAN:
case KPS_SCANNING:
/* Actually replace the route */
krt_replace_rte(p, net, new, old);
@@ -732,7 +743,6 @@ krt_reload_routes(struct channel *C, struct rt_feeding_request *rfr)
if (KRT_CF->learn)
{
- p->reload = 1;
krt_scan_timer_kick(p);
}
@@ -749,15 +759,18 @@ krt_export_fed(struct channel *C)
{
struct krt_proto *p = (void *) C->proto;
- p->ready = 1;
- p->initialized = 1;
-
switch (p->sync_state)
{
+ case KPS_INIT:
+ bug("KRT export started before scan");
+
case KPS_IDLE:
krt_scan_timer_kick(p);
break;
+ case KPS_FIRST_SCAN:
+ bug("KRT export done before first scan");
+
case KPS_SCANNING:
break;
@@ -831,7 +844,8 @@ krt_init(struct proto_config *CF)
p->p.main_channel = proto_add_channel(&p->p, proto_cf_main_channel(CF));
p->p.preexport = krt_preexport;
- p->p.rt_notify = krt_rt_notify;
+ /* Not setting rt_notify here to not start exports, must wait for the first scan
+ * and then we can start exports manually */
p->p.iface_sub.if_notify = krt_if_notify;
p->p.reload_routes = krt_reload_routes;
p->p.export_fed = krt_export_fed;
@@ -887,7 +901,7 @@ krt_shutdown(struct proto *P)
return PS_FLUSH;
/* FIXME we should flush routes even when persist during reconfiguration */
- if (p->initialized && !KRT_CF->persist && (P->down_code != PDC_CMD_GR_DOWN))
+ if ((p->sync_state != KPS_INIT) && !KRT_CF->persist && (P->down_code != PDC_CMD_GR_DOWN))
{
struct rt_export_feeder req = (struct rt_export_feeder)
{
@@ -922,8 +936,7 @@ krt_shutdown(struct proto *P)
static void
krt_cleanup(struct krt_proto *p)
{
- p->ready = 0;
- p->initialized = 0;
+ p->sync_state = KPS_INIT;
krt_sys_shutdown(p);
rem_node(&p->krt_node);
diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h
index 394e74010..14be715f8 100644
--- sysdep/unix/krt.h
+++ sysdep/unix/krt.h
@@ -59,10 +59,9 @@ struct krt_proto {
struct bmap seen_map; /* Routes seen during last periodic scan */
node krt_node; /* Node in krt_proto_list */
byte af; /* Kernel address family (AF_*) */
- byte ready; /* Initial feed has been finished */
- byte initialized; /* First scan has been finished */
- byte reload; /* Next scan is doing reload */
PACKED enum krt_prune_state {
+ KPS_INIT,
+ KPS_FIRST_SCAN,
KPS_IDLE,
KPS_SCANNING,
KPS_PRUNING,
--
GitLab

View file

@ -0,0 +1,311 @@
From f7639a9fafa7411ebd1f2af56c270b970ac09f3d Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Mon, 23 Dec 2024 21:06:26 +0100
Subject: [PATCH] Graceful recovery: converted to obstacles
Yet another refcounting mechanism had a locking collision.
---
nest/proto.c | 178 ++++++++++++++++++++++++++----------------------
nest/protocol.h | 14 +++-
2 files changed, 110 insertions(+), 82 deletions(-)
diff --git a/nest/proto.c b/nest/proto.c
index 6fa74e9f1..caf99829b 100644
--- nest/proto.c
+++ nest/proto.c
@@ -31,15 +31,8 @@ static list STATIC_LIST_INIT(protocol_list);
#define CD(c, msg, args...) ({ if (c->debug & D_STATES) log(L_TRACE "%s.%s: " msg, c->proto->name, c->name ?: "?", ## args); })
#define PD(p, msg, args...) ({ if (p->debug & D_STATES) log(L_TRACE "%s: " msg, p->name, ## args); })
-static timer *gr_wait_timer;
-
-#define GRS_NONE 0
-#define GRS_INIT 1
-#define GRS_ACTIVE 2
-#define GRS_DONE 3
-
-static int graceful_restart_state;
-static u32 graceful_restart_locks;
+static struct graceful_recovery_context _graceful_recovery_context;
+OBSREF(struct graceful_recovery_context) graceful_recovery_context;
static char *p_states[] = { "DOWN", "START", "UP", "STOP" };
static char *c_states[] = { "DOWN", "START", "UP", "STOP", "RESTART" };
@@ -912,7 +905,7 @@ channel_do_stop(struct channel *c)
ev_postpone(&c->reimport_event);
c->gr_wait = 0;
- if (c->gr_lock)
+ if (OBSREF_GET(c->gr_lock))
channel_graceful_restart_unlock(c);
CALL(c->class->shutdown, c);
@@ -1407,7 +1400,7 @@ proto_start(struct proto *p)
DBG("Kicking %s up\n", p->name);
PD(p, "Starting");
- if (graceful_restart_state == GRS_INIT)
+ if (OBSREF_GET(graceful_recovery_context))
p->gr_recovery = 1;
if (p->cf->loop_order != DOMAIN_ORDER(the_bird))
@@ -1921,7 +1914,45 @@ proto_enable(struct proto *p)
*
*/
-static void graceful_restart_done(timer *t);
+/**
+ * graceful_recovery_done - finalize graceful recovery
+ * @_: unused
+ *
+ * When there are no locks on graceful restart, the function finalizes the
+ * graceful restart recovery. Protocols postponing route export until the end of
+ * the recovery are awakened and the export to them is enabled.
+ */
+static void
+graceful_recovery_done(struct callback *_ UNUSED)
+{
+ ASSERT_DIE(birdloop_inside(&main_birdloop));
+ ASSERT_DIE(_graceful_recovery_context.grc_state == GRS_ACTIVE);
+
+ tm_stop(&_graceful_recovery_context.wait_timer);
+ log(L_INFO "Graceful recovery done");
+
+ WALK_TLIST(proto, p, &global_proto_list)
+ PROTO_LOCKED_FROM_MAIN(p)
+ {
+ p->gr_recovery = 0;
+
+ struct channel *c;
+ WALK_LIST(c, p->channels)
+ {
+ ASSERT_DIE(!OBSREF_GET(c->gr_lock));
+
+ /* Resume postponed export of routes */
+ if ((c->channel_state == CS_UP) && c->gr_wait && p->rt_notify)
+ channel_start_export(c);
+
+ /* Cleanup */
+ c->gr_wait = 0;
+ }
+ }
+
+ _graceful_recovery_context.grc_state = GRS_DONE;
+}
+
/**
* graceful_restart_recovery - request initial graceful restart recovery
@@ -1933,7 +1964,30 @@ static void graceful_restart_done(timer *t);
void
graceful_restart_recovery(void)
{
- graceful_restart_state = GRS_INIT;
+ obstacle_target_init(
+ &_graceful_recovery_context.obstacles,
+ &_graceful_recovery_context.obstacles_cleared,
+ &root_pool, "Graceful recovery");
+
+ OBSREF_SET(graceful_recovery_context, &_graceful_recovery_context);
+ _graceful_recovery_context.grc_state = GRS_INIT;
+}
+
+static void
+graceful_recovery_timeout(timer *t UNUSED)
+{
+ log(L_INFO "Graceful recovery timeout");
+ WALK_TLIST(proto, p, &global_proto_list)
+ PROTO_LOCKED_FROM_MAIN(p)
+ {
+ struct channel *c;
+ WALK_LIST(c, p->channels)
+ if (OBSREF_GET(c->gr_lock))
+ {
+ log(L_INFO "Graceful recovery: Not waiting for %s.%s", p->name, c->name);
+ OBSREF_CLEAR(c->gr_lock);
+ }
+ }
}
/**
@@ -1946,73 +2000,35 @@ graceful_restart_recovery(void)
void
graceful_restart_init(void)
{
- if (!graceful_restart_state)
+ if (!OBSREF_GET(graceful_recovery_context))
return;
- log(L_INFO "Graceful restart started");
+ log(L_INFO "Graceful recovery started");
- if (!graceful_restart_locks)
- {
- graceful_restart_done(NULL);
- return;
- }
+ _graceful_recovery_context.grc_state = GRS_ACTIVE;
- graceful_restart_state = GRS_ACTIVE;
- gr_wait_timer = tm_new_init(proto_pool, graceful_restart_done, NULL, 0, 0);
+ _graceful_recovery_context.wait_timer = (timer) { .hook = graceful_recovery_timeout };
u32 gr_wait = atomic_load_explicit(&global_runtime, memory_order_relaxed)->gr_wait;
- tm_start(gr_wait_timer, gr_wait S);
-}
-
-/**
- * graceful_restart_done - finalize graceful restart
- * @t: unused
- *
- * When there are no locks on graceful restart, the functions finalizes the
- * graceful restart recovery. Protocols postponing route export until the end of
- * the recovery are awakened and the export to them is enabled. All other
- * related state is cleared. The function is also called when the graceful
- * restart wait timer fires (but there are still some locks).
- */
-static void
-graceful_restart_done(timer *t)
-{
- log(L_INFO "Graceful restart done");
- graceful_restart_state = GRS_DONE;
-
- WALK_TLIST(proto, p, &global_proto_list)
- {
- if (!p->gr_recovery)
- continue;
-
- struct channel *c;
- WALK_LIST(c, p->channels)
- {
- /* Resume postponed export of routes */
- if ((c->channel_state == CS_UP) && c->gr_wait && p->rt_notify)
- channel_start_export(c);
+ tm_start(&_graceful_recovery_context.wait_timer, gr_wait S);
- /* Cleanup */
- c->gr_wait = 0;
- c->gr_lock = 0;
- }
-
- p->gr_recovery = 0;
- }
+ callback_init(&_graceful_recovery_context.obstacles_cleared, graceful_recovery_done, &main_birdloop);
- graceful_restart_locks = 0;
-
- rfree(t);
+ /* The last clearing of obstacle reference will cause
+ * the graceful recovery finish immediately. */
+ OBSREF_CLEAR(graceful_recovery_context);
}
void
graceful_restart_show_status(void)
{
- if (graceful_restart_state != GRS_ACTIVE)
+ if (_graceful_recovery_context.grc_state != GRS_ACTIVE)
return;
cli_msg(-24, "Graceful restart recovery in progress");
- cli_msg(-24, " Waiting for %d channels to recover", graceful_restart_locks);
- cli_msg(-24, " Wait timer is %t/%u", tm_remains(gr_wait_timer),
+ cli_msg(-24, " Waiting for %u channels to recover",
+ obstacle_target_count(&_graceful_recovery_context.obstacles));
+ cli_msg(-24, " Wait timer is %t/%u",
+ tm_remains(&_graceful_recovery_context.wait_timer),
atomic_load_explicit(&global_runtime, memory_order_relaxed)->gr_wait);
}
@@ -2032,14 +2048,22 @@ graceful_restart_show_status(void)
void
channel_graceful_restart_lock(struct channel *c)
{
- ASSERT(graceful_restart_state == GRS_INIT);
- ASSERT(c->proto->gr_recovery);
+ ASSERT_DIE(birdloop_inside(&main_birdloop));
- if (c->gr_lock)
+ if (OBSREF_GET(c->gr_lock))
return;
- c->gr_lock = 1;
- graceful_restart_locks++;
+ switch (_graceful_recovery_context.grc_state)
+ {
+ case GRS_INIT:
+ case GRS_ACTIVE:
+ OBSREF_SET(c->gr_lock, &_graceful_recovery_context);
+ break;
+
+ case GRS_NONE:
+ case GRS_DONE:
+ break;
+ }
}
/**
@@ -2052,18 +2076,10 @@ channel_graceful_restart_lock(struct channel *c)
void
channel_graceful_restart_unlock(struct channel *c)
{
- if (!c->gr_lock)
- return;
-
- c->gr_lock = 0;
- graceful_restart_locks--;
-
- if ((graceful_restart_state == GRS_ACTIVE) && !graceful_restart_locks)
- tm_start(gr_wait_timer, 0);
+ OBSREF_CLEAR(c->gr_lock);
}
-
/**
* protos_dump_all - dump status of all protocols
*
@@ -2615,9 +2631,9 @@ channel_show_info(struct channel *c)
cli_msg(-1006, " Input filter: %s", filter_name(c->in_filter));
cli_msg(-1006, " Output filter: %s", filter_name(c->out_filter));
- if (graceful_restart_state == GRS_ACTIVE)
+ if (_graceful_recovery_context.grc_state == GRS_ACTIVE)
cli_msg(-1006, " GR recovery: %s%s",
- c->gr_lock ? " pending" : "",
+ OBSREF_GET(c->gr_lock) ? " pending" : "",
c->gr_wait ? " waiting" : "");
channel_show_limit(&c->rx_limit, "Receive limit:", c->limit_active & (1 << PLD_RX), c->limit_actions[PLD_RX]);
diff --git a/nest/protocol.h b/nest/protocol.h
index 2bfa1628a..ec561b263 100644
--- nest/protocol.h
+++ nest/protocol.h
@@ -659,7 +659,7 @@ struct channel {
u8 channel_state;
u8 reloadable; /* Hook reload_routes() is allowed on the channel */
- u8 gr_lock; /* Graceful restart mechanism should wait for this channel */
+ OBSREF(struct graceful_recovery_context) gr_lock; /* Graceful restart mechanism should wait for this channel */
u8 gr_wait; /* Route export to channel is postponed until graceful restart */
u32 obstacles; /* External obstacles remaining before cleanup */
@@ -763,4 +763,16 @@ void *channel_config_new(const struct channel_class *cc, const char *name, uint
void *channel_config_get(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto);
int channel_reconfigure(struct channel *c, struct channel_config *cf);
+struct graceful_recovery_context {
+ struct obstacle_target obstacles;
+ struct callback obstacles_cleared;
+ enum {
+ GRS_NONE,
+ GRS_INIT,
+ GRS_ACTIVE,
+ GRS_DONE,
+ } grc_state;
+ timer wait_timer;
+};
+
#endif
--
GitLab

View file

@ -0,0 +1,116 @@
From f6ef8b5b58c674dd270b40aa57d20d2d638c48e9 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Tue, 24 Dec 2024 12:18:39 +0100
Subject: [PATCH] Stonehenge: multi-slab allocator
To mid-term allocate and free lots of small blocks in a fast pace,
mb_alloc is too slow and causes heap bloating. We can already allocate
blocks from slabs, and if we allow for a little bit of inefficiency,
we can just use multiple slabs with stepped sizes.
This technique is already used in ea_list allocation which is gonna be
converted to Stonehenge.
---
lib/resource.h | 14 ++++++++++++
lib/slab.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 76 insertions(+)
diff --git a/lib/resource.h b/lib/resource.h
index 48bf1f9ba..12b788510 100644
--- lib/resource.h
+++ lib/resource.h
@@ -139,6 +139,20 @@ void *sl_allocz(slab *);
void sl_free(void *);
void sl_delete(slab *);
+/* A whole stonehenge of slabs */
+
+typedef struct stonehenge stonehenge;
+typedef struct sth_block {
+ void *block;
+ bool large;
+} sth_block;
+
+stonehenge *sth_new(pool *);
+sth_block sth_alloc(stonehenge *, uint size);
+sth_block sth_allocz(stonehenge *, uint size);
+void sth_free(sth_block);
+void sth_delete(stonehenge *);
+
/*
* Low-level memory allocation functions, please don't use
* outside resource manager and possibly sysdep code.
diff --git a/lib/slab.c b/lib/slab.c
index ca971f9fb..d68bfef1e 100644
--- lib/slab.c
+++ lib/slab.c
@@ -469,4 +469,66 @@ slab_lookup(resource *r, unsigned long a)
return NULL;
}
+static const uint stonehenge_sizes[] = { 56, 112, 168, 288, 448, 800, 1344 };
+
+struct stonehenge {
+ pool *p;
+ slab *s[ARRAY_SIZE(stonehenge_sizes)];
+};
+
+sth_block
+sth_alloc(stonehenge *sth, uint size)
+{
+ for (uint i=0; i<ARRAY_SIZE(stonehenge_sizes); i++)
+ if (size <= stonehenge_sizes[i])
+ {
+ if (!sth->s[i])
+ sth->s[i] = sl_new(sth->p, stonehenge_sizes[i]);
+
+ return (sth_block) { .block = sl_alloc(sth->s[i]), };
+ }
+
+ return (sth_block) {
+ .block = mb_alloc(sth->p, size),
+ .large = 1,
+ };
+}
+
+sth_block
+sth_allocz(stonehenge *sth, uint size)
+{
+ sth_block b = sth_alloc(sth, size);
+ bzero(b.block, size);
+ return b;
+}
+
+void
+sth_free(sth_block b)
+{
+ if (b.large)
+ mb_free(b.block);
+ else
+ sl_free(b.block);
+}
+
+stonehenge *
+sth_new(pool *pp)
+{
+ stonehenge tmps = {
+ .p = rp_new(pp, pp->domain, "Stonehenge"),
+ };
+
+ stonehenge *s = sth_alloc(&tmps, sizeof(stonehenge)).block;
+ *s = tmps;
+ return s;
+}
+
+void sth_delete(stonehenge *s)
+{
+ pool *p = s->p;
+ sth_free((sth_block) { s });
+ rp_free(p);
+}
+
+
#endif
--
GitLab

View file

@ -0,0 +1,80 @@
From 8b389a503ef56aa69aa456fabebd562abe247119 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Tue, 24 Dec 2024 13:12:58 +0100
Subject: [PATCH] Route attribute storage moved to Stonehenge
---
nest/rt-attr.c | 29 ++++++++---------------------
1 file changed, 8 insertions(+), 21 deletions(-)
diff --git a/nest/rt-attr.c b/nest/rt-attr.c
index a0f7d5718..8d651efb2 100644
--- nest/rt-attr.c
+++ nest/rt-attr.c
@@ -204,9 +204,7 @@ DOMAIN(attrs) attrs_domain;
pool *rta_pool;
-/* Assuming page size of 4096, these are magic values for slab allocation */
-static const uint ea_slab_sizes[] = { 56, 112, 168, 288, 448, 800, 1344 };
-static slab *ea_slab[ARRAY_SIZE(ea_slab_sizes)];
+static stonehenge *ea_sth;
static slab *rte_src_slab;
@@ -1583,24 +1581,18 @@ ea_lookup_slow(ea_list *o, u32 squash_upto, enum ea_stored oid)
return rr;
}
- struct ea_storage *r = NULL;
uint elen = ea_list_size(o);
uint sz = elen + sizeof(struct ea_storage);
- for (uint i=0; i<ARRAY_SIZE(ea_slab_sizes); i++)
- if (sz <= ea_slab_sizes[i])
- {
- r = sl_alloc(ea_slab[i]);
- break;
- }
+ sth_block b = sth_alloc(ea_sth, sz);
- int huge = r ? 0 : EALF_HUGE;;
- if (huge)
- r = mb_alloc(rta_pool, sz);
+ struct ea_storage *r = b.block;
ea_list_copy(r->l, o, elen);
ea_list_ref(r->l);
- r->l->flags |= huge;
+ if (b.large)
+ r->l->flags |= EALF_HUGE;
+
r->l->stored = oid;
r->hash_key = h;
atomic_store_explicit(&r->uc, 1, memory_order_release);
@@ -1668,10 +1660,7 @@ ea_free_deferred(struct deferred_call *dc)
/* And now we can free the object, finally */
ea_list_unref(r->l);
- if (r->l->flags & EALF_HUGE)
- mb_free(r);
- else
- sl_free(r);
+ sth_free((sth_block) { r, !!(r->l->flags & EALF_HUGE) });
RTA_UNLOCK;
}
@@ -1722,9 +1711,7 @@ rta_init(void)
RTA_LOCK;
rta_pool = rp_new(&root_pool, attrs_domain.attrs, "Attributes");
- for (uint i=0; i<ARRAY_SIZE(ea_slab_sizes); i++)
- ea_slab[i] = sl_new(rta_pool, ea_slab_sizes[i]);
-
+ ea_sth = sth_new(rta_pool);
SPINHASH_INIT(rta_hash_table, RTAH, rta_pool, &global_work_list);
rte_src_init();
--
GitLab

View file

@ -0,0 +1,84 @@
From fdb5c4920b45139fb3c37e1144643c0f756364b6 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Tue, 24 Dec 2024 13:22:56 +0100
Subject: [PATCH] BGP: TX bucket storage moved to Stonehenge
---
proto/bgp/attrs.c | 11 +++++++----
proto/bgp/bgp.h | 4 ++--
2 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c
index a2feaef53..725c469ff 100644
--- proto/bgp/attrs.c
+++ proto/bgp/attrs.c
@@ -1734,13 +1734,16 @@ bgp_get_bucket(struct bgp_ptx_private *c, ea_list *new)
uint size = sizeof(struct bgp_bucket) + ea_size;
/* Allocate the bucket */
- b = mb_alloc(c->pool, size);
+ sth_block blk = sth_alloc(c->sth, size);
+ b = blk.block;
*b = (struct bgp_bucket) { };
init_list(&b->prefixes);
b->hash = hash;
/* Copy the ea_list */
ea_list_copy(b->eattrs, new, ea_size);
+ if (blk.large)
+ b->eattrs->flags |= EALF_HUGE;
/* Insert the bucket to bucket hash */
HASH_INSERT2(c->bucket_hash, RBH, c->pool, b);
@@ -1764,7 +1767,7 @@ static void
bgp_free_bucket(struct bgp_ptx_private *c, struct bgp_bucket *b)
{
HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b);
- mb_free(b);
+ sth_free((sth_block) { b, !!(b->eattrs->flags & EALF_HUGE) });
}
int
@@ -2086,6 +2089,7 @@ bgp_init_pending_tx(struct bgp_channel *c)
bpp->lock = dom;
bpp->pool = p;
+ bpp->sth = sth_new(p);
bpp->c = c;
bgp_init_bucket_table(bpp);
@@ -2160,8 +2164,7 @@ bgp_free_pending_tx(struct bgp_channel *bc)
HASH_WALK_END;
HASH_FREE(c->bucket_hash);
- sl_delete(c->bucket_slab);
- c->bucket_slab = NULL;
+ sth_delete(c->sth);
rp_free(c->pool);
diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h
index 202e78ba3..dac6e84ea 100644
--- proto/bgp/bgp.h
+++ proto/bgp/bgp.h
@@ -452,7 +452,8 @@ struct bgp_ptx_private {
struct { BGP_PTX_PUBLIC; };
struct bgp_ptx_private **locked_at;
- pool *pool; /* Resource pool for TX related allocations */
+ pool *pool; /* Pool for infrequent long-term blocks */
+ stonehenge *sth; /* Bucket allocator */
HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */
struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */
@@ -461,7 +462,6 @@ struct bgp_ptx_private {
HASH(struct bgp_prefix) prefix_hash; /* Hash table of pending prefices */
slab *prefix_slab; /* Slab holding prefix nodes */
- slab *bucket_slab; /* Slab holding buckets to send */
char bmp; /* This is a fake ptx for BMP encoding */
};
--
GitLab

View file

@ -0,0 +1,100 @@
From c3c12e1b4ff908211b156a182a5027f2b11b0709 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Tue, 24 Dec 2024 16:16:55 +0100
Subject: [PATCH] Allocate the normalization buckets on stack
Even though allocating from tmp_linpool is quite cheap,
it isn't cheap when the block is larger than a page, which is the case here.
Instead, we now allocate just the result which typically fits in a page,
avoiding a necessity of a malloc().
---
nest/rt-attr.c | 37 ++++++++++++++++++++++++-------------
1 file changed, 24 insertions(+), 13 deletions(-)
diff --git a/nest/rt-attr.c b/nest/rt-attr.c
index 8d651efb2..9d5e10980 100644
--- nest/rt-attr.c
+++ nest/rt-attr.c
@@ -967,8 +967,8 @@ ea_list_size(ea_list *o)
* and creates the final structure useful for storage or fast searching.
* The method is a bucket sort.
*
- * Returns the final ea_list with some excess memory at the end,
- * allocated from the tmp_linpool. The adata is linked from the original places.
+ * Returns the final ea_list allocated from the tmp_linpool.
+ * The adata is linked from the original places.
*/
ea_list *
ea_normalize(ea_list *e, u32 upto)
@@ -976,21 +976,17 @@ ea_normalize(ea_list *e, u32 upto)
/* We expect some work to be actually needed. */
ASSERT_DIE(!BIT32_TEST(&upto, e->stored));
- /* Allocate the output */
- ea_list *out = tmp_allocz(ea_class_max * sizeof(eattr) + sizeof(ea_list));
- *out = (ea_list) {
- .flags = EALF_SORTED,
- };
-
+ /* Allocate the buckets locally */
+ eattr *buckets = allocz(ea_class_max * sizeof(eattr));
uint min_id = ~0, max_id = 0;
- eattr *buckets = out->attrs;
+ ea_list *next = NULL;
/* Walk the attribute lists, one after another. */
for (; e; e = e->next)
{
- if (!out->next && BIT32_TEST(&upto, e->stored))
- out->next = e;
+ if (!next && BIT32_TEST(&upto, e->stored))
+ next = e;
for (int i = 0; i < e->count; i++)
{
@@ -1000,7 +996,7 @@ ea_normalize(ea_list *e, u32 upto)
if (id < min_id)
min_id = id;
- if (out->next)
+ if (next)
{
/* Underlay: check whether the value is duplicate */
if (buckets[id].id && buckets[id].fresh)
@@ -1026,6 +1022,18 @@ ea_normalize(ea_list *e, u32 upto)
}
}
+ /* Find out how big the output actually is. */
+ uint len = 0;
+ for (uint id = min_id; id <= max_id; id++)
+ if (buckets[id].id && !(buckets[id].undef && buckets[id].fresh))
+ len++;
+
+ ea_list *out = tmp_alloc(sizeof(ea_list) + len * sizeof(eattr));
+ *out = (ea_list) {
+ .flags = EALF_SORTED,
+ .next = next,
+ };
+
/* And now we just walk the list from beginning to end and collect
* everything to the beginning of the list.
* Walking just that part which is inhabited for sure. */
@@ -1044,9 +1052,12 @@ ea_normalize(ea_list *e, u32 upto)
/* Move the attribute to the beginning */
ASSERT_DIE(out->count < id);
- buckets[out->count++] = buckets[id];
+ ASSERT_DIE(out->count < len);
+ out->attrs[out->count++] = buckets[id];
}
+ ASSERT_DIE(out->count == len);
+
/* We want to bisect only if the list is long enough */
if (out->count > 5)
out->flags |= EALF_BISECT;
--
GitLab

View file

@ -0,0 +1,25 @@
From b58bfcad683f46da9470ad87e8c78e423e04ff97 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Fri, 27 Dec 2024 16:22:59 +0100
Subject: [PATCH] BGP: fix display name of bgp_otc attribute
---
proto/bgp/attrs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c
index 725c469ff..5dc06be51 100644
--- proto/bgp/attrs.c
+++ proto/bgp/attrs.c
@@ -1192,7 +1192,7 @@ static union bgp_attr_desc bgp_attr_table[BGP_ATTR_MAX] = {
.decode = bgp_decode_large_community,
},
[BA_ONLY_TO_CUSTOMER] = {
- .name = "otc",
+ .name = "bgp_otc",
.type = T_INT,
.flags = BAF_OPTIONAL | BAF_TRANSITIVE,
.encode = bgp_encode_u32,
--
GitLab

View file

@ -0,0 +1,65 @@
From c5b07695ce810e4345ed1811eadfce935c83b324 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Tue, 7 Jan 2025 11:08:04 +0100
Subject: [PATCH] BGP: fixed deterministic med crashes
There were several places of forgotten NULL checks.
Thanks to Alarig Le Lay <alarig@swordarmor.fr> for reporting:
https://trubka.network.cz/pipermail/bird-users/2024-December/017990.html
---
nest/rt-table.c | 14 ++++++++++++--
proto/bgp/attrs.c | 8 ++++----
2 files changed, 16 insertions(+), 6 deletions(-)
diff --git a/nest/rt-table.c b/nest/rt-table.c
index 05191d743..fc6d0d4e0 100644
--- nest/rt-table.c
+++ nest/rt-table.c
@@ -2024,12 +2024,22 @@ rte_recalculate(struct rtable_private *table, struct rt_import_hook *c, struct n
do_recalculate:
/* Add the new route to the list right behind the old one */
if (new_stored)
+ {
+ /* There is the same piece of code several lines farther. Needs refactoring.
+ * The old_stored check is needed because of the possible jump from deterministic med */
+ if (old_stored)
{
atomic_store_explicit(&new_stored->next, atomic_load_explicit(&old_stored->next, memory_order_relaxed), memory_order_release);
atomic_store_explicit(&old_stored->next, new_stored, memory_order_release);
-
- table->rt_count++;
}
+ else
+ {
+ atomic_store_explicit(&new_stored->next, NULL, memory_order_release);
+ atomic_store_explicit(last_ptr, new_stored, memory_order_release);
+ }
+
+ table->rt_count++;
+ }
/* Find a new optimal route (if there is any) */
struct rte_storage * _Atomic *bp = &local_sentinel.next;
diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c
index 5dc06be51..db6542343 100644
--- proto/bgp/attrs.c
+++ proto/bgp/attrs.c
@@ -2689,10 +2689,10 @@ bgp_rte_recalculate(struct rtable_private *table, net *net,
struct rte_storage *new_stored, struct rte_storage *old_stored, struct rte_storage *old_best_stored)
{
struct rte_storage *key_stored = new_stored ? new_stored : old_stored;
- const struct rte *new = &new_stored->rte,
- *old = &old_stored->rte,
- *old_best = &old_best_stored->rte,
- *key = &key_stored->rte;
+ const struct rte *new = RTE_OR_NULL(new_stored),
+ *old = RTE_OR_NULL(old_stored),
+ *old_best = RTE_OR_NULL(old_best_stored),
+ *key = RTE_OR_NULL(key_stored);
u32 lpref = rt_get_preference(key);
u32 lasn = bgp_get_neighbor(key);
--
GitLab

View file

@ -0,0 +1,87 @@
From 2e14832d36c83b2ab5b7fb28b701de554fa5fdd9 Mon Sep 17 00:00:00 2001
From: Maria Matejka <mq@ucw.cz>
Date: Tue, 7 Jan 2025 12:13:57 +0100
Subject: [PATCH] Table: old best route refeed fix
When refeeding with RA_OPTIMAL, the old best routes weren't announced,
leading to weird behavior of protocols, mostly kernel. Fixed.
---
nest/rt-table.c | 30 ++++++++++++++++++++++++++----
1 file changed, 26 insertions(+), 4 deletions(-)
diff --git a/nest/rt-table.c b/nest/rt-table.c
index fc6d0d4e0..18a445a62 100644
--- nest/rt-table.c
+++ nest/rt-table.c
@@ -1485,11 +1485,18 @@ channel_notify_basic(void *_channel)
rte *new = &u->feed->block[i];
rte *old = NULL;
for (uint o = oldpos; o < u->feed->count_routes; o++)
- if (new->src == u->feed->block[o].src)
+ if ((c->ra_mode == RA_ANY) && (new->src == u->feed->block[o].src))
{
old = &u->feed->block[o];
break;
}
+ else if ((c->ra_mode == RA_OPTIMAL) && (
+ bmap_test(&c->export_accepted_map, u->feed->block[o].id) ||
+ bmap_test(&c->export_rejected_map, u->feed->block[o].id)))
+ {
+ ASSERT_DIE(!old);
+ old = &u->feed->block[o];
+ }
rt_notify_basic(c, new, old);
@@ -2542,10 +2549,14 @@ rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool
last_in_net = atomic_load_explicit(&n->best.last, memory_order_acquire);
first = rt_net_feed_validate_first(tr, first_in_net, last_in_net, first);
- uint ecnt = 0;
+ uint ecnt = 0, ocnt = 0;
for (const struct rt_pending_export *rpe = first; rpe;
rpe = atomic_load_explicit(&rpe->next, memory_order_acquire))
+ {
ecnt++;
+ if (rpe->it.old)
+ ocnt++;
+ }
if (ecnt) {
const net_addr *a = (first->it.new ?: first->it.old)->net;
@@ -2558,10 +2569,11 @@ rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool
if (!ecnt && (!best || prefilter && !prefilter(f, best->rte.net)))
return NULL;
- struct rt_export_feed *feed = rt_alloc_feed(!!best, ecnt);
+ struct rt_export_feed *feed = rt_alloc_feed(!!best + ocnt, ecnt);
+ uint bpos = 0;
if (best)
{
- feed->block[0] = best->rte;
+ feed->block[bpos++] = best->rte;
feed->ni = NET_TO_INDEX(best->rte.net);
}
else
@@ -2575,8 +2587,18 @@ rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool
if (e >= ecnt)
RT_READ_RETRY(tr);
else
+ {
feed->exports[e++] = rpe->it.seq;
+ if (rpe->it.old)
+ {
+ ASSERT_DIE(bpos < !!best + ocnt);
+ feed->block[bpos] = *rpe->it.old;
+ feed->block[bpos].flags |= REF_OBSOLETE;
+ bpos++;
+ }
+ }
+ ASSERT_DIE(bpos == !!best + ocnt);
ASSERT_DIE(e == ecnt);
}
--
GitLab

View file

@ -0,0 +1,11 @@
--- Makefile.in.orig 2022-02-21 11:12:46 UTC
+++ Makefile.in
@@ -207,7 +207,7 @@ install: all
$(INSTALL_PROGRAM) $(exedir)/$$BIN $(DESTDIR)/$(sbindir)/$$BIN ; \
done
if ! test -f $(DESTDIR)/@CONFIG_FILE@ ; then \
- $(INSTALL_DATA) $(srcdir)/doc/bird.conf.example $(DESTDIR)/@CONFIG_FILE@ ; \
+ $(INSTALL_DATA) $(srcdir)/doc/bird.conf.example $(DESTDIR)/@CONFIG_FILE@.sample ; \
else \
echo "Not overwriting old bird.conf" ; \
fi

View file

@ -0,0 +1,11 @@
[
{ type: install
message: <<EOM
%%LOCALBASE%%/sbin/birdc was added to /etc/shells.
To assign it to a user, the user needs to be in group wheel.
Alternatively you could use security/sudo to grant a user access to it,
which would allow '%%LOCALBASE%%/sbin/birdc -r' to restrict to read-only.
EOM
}
]

14
net/bird3/pkg-descr Normal file
View file

@ -0,0 +1,14 @@
The BIRD project aims to develop a fully functional dynamic IP routing daemon.
- Both IPv4 and IPv6
- Multiple routing tables
- BGP
- RIP
- OSPF
- LDP
- L3VPN
- Static routes
- Inter-table protocol
- Command-line interface
- Soft reconfiguration
- Powerful language for route filtering
Starting with release 3.x, BIRD is multi-threaded.

4
net/bird3/pkg-plist Normal file
View file

@ -0,0 +1,4 @@
@sample etc/bird.conf.sample
@shell sbin/birdc
sbin/bird
sbin/birdcl