2 * Licensed under the Apache License, Version 2.0 (the "License");
3 * you may not use this file except in compliance with the License.
4 * You may obtain a copy of the License at:
6 * http://www.apache.org/licenses/LICENSE-2.0
8 * Unless required by applicable law or agreed to in writing, software
9 * distributed under the License is distributed on an "AS IS" BASIS,
10 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 * See the License for the specific language governing permissions and
12 * limitations under the License.
21 #include "command-line.h"
24 #include "openvswitch/dynamic-string.h"
25 #include "fatal-signal.h"
29 #include "ovn/lib/lex.h"
30 #include "ovn/lib/ovn-nb-idl.h"
31 #include "ovn/lib/ovn-sb-idl.h"
32 #include "ovn/lib/ovn-util.h"
34 #include "poll-loop.h"
38 #include "stream-ssl.h"
42 #include "openvswitch/vlog.h"
44 VLOG_DEFINE_THIS_MODULE(ovn_northd);
/* unixctl command handler that asks the daemon to exit cleanly. */
46 static unixctl_cb_func ovn_northd_exit;
/* State threaded through every processing pass: IDL handles and the
 * currently open transactions for the NB and SB databases.
 * (NOTE(review): the closing "};" is elided in this excerpt.) */
48 struct northd_context {
49 struct ovsdb_idl *ovnnb_idl;
50 struct ovsdb_idl *ovnsb_idl;
51 struct ovsdb_idl_txn *ovnnb_txn;
52 struct ovsdb_idl_txn *ovnsb_txn;
/* Connection targets for the northbound and southbound databases. */
55 static const char *ovnnb_db;
56 static const char *ovnsb_db;
58 static const char *default_nb_db(void);
59 static const char *default_sb_db(void);
61 /* Pipeline stages. */
63 /* The two pipelines in an OVN logical flow table. */
/* NOTE(review): the "enum ovn_pipeline {" opener and closing "};" are
 * elided in this excerpt. */
65 P_IN, /* Ingress pipeline. */
66 P_OUT /* Egress pipeline. */
69 /* The two purposes for which ovn-northd uses OVN logical datapaths. */
70 enum ovn_datapath_type {
71 DP_SWITCH, /* OVN logical switch. */
72 DP_ROUTER /* OVN logical router. */
75 /* Returns an "enum ovn_stage" built from the arguments.
77 * (It's better to use ovn_stage_build() for type-safety reasons, but inline
78 * functions can't be used in enums or switch cases.) */
/* Encoding: bit 9 = datapath type, bit 8 = pipeline, bits 0-7 = table. */
79 #define OVN_STAGE_BUILD(DP_TYPE, PIPELINE, TABLE) \
80 (((DP_TYPE) << 9) | ((PIPELINE) << 8) | (TABLE))
82 /* A stage within an OVN logical switch or router.
84 * An "enum ovn_stage" indicates whether the stage is part of a logical switch
85 * or router, whether the stage is part of the ingress or egress pipeline, and
86 * the table within that pipeline. The first three components are combined to
87 * form the stage's full name, e.g. S_SWITCH_IN_PORT_SEC_L2,
88 * S_ROUTER_OUT_DELIVERY. */
/* X-macro list of every (datapath type, pipeline, stage, table number,
 * external name) tuple.  Expanded below for the stage enum and again in
 * ovn_stage_to_str() and ovn_stage_to_datapath_type(). */
90 #define PIPELINE_STAGES \
91 /* Logical switch ingress stages. */ \
92 PIPELINE_STAGE(SWITCH, IN, PORT_SEC_L2, 0, "ls_in_port_sec_l2") \
93 PIPELINE_STAGE(SWITCH, IN, PORT_SEC_IP, 1, "ls_in_port_sec_ip") \
94 PIPELINE_STAGE(SWITCH, IN, PORT_SEC_ND, 2, "ls_in_port_sec_nd") \
95 PIPELINE_STAGE(SWITCH, IN, PRE_ACL, 3, "ls_in_pre_acl") \
96 PIPELINE_STAGE(SWITCH, IN, PRE_LB, 4, "ls_in_pre_lb") \
97 PIPELINE_STAGE(SWITCH, IN, PRE_STATEFUL, 5, "ls_in_pre_stateful") \
98 PIPELINE_STAGE(SWITCH, IN, ACL, 6, "ls_in_acl") \
99 PIPELINE_STAGE(SWITCH, IN, LB, 7, "ls_in_lb") \
100 PIPELINE_STAGE(SWITCH, IN, STATEFUL, 8, "ls_in_stateful") \
101 PIPELINE_STAGE(SWITCH, IN, ARP_ND_RSP, 9, "ls_in_arp_rsp") \
102 PIPELINE_STAGE(SWITCH, IN, L2_LKUP, 10, "ls_in_l2_lkup") \
104 /* Logical switch egress stages. */ \
105 PIPELINE_STAGE(SWITCH, OUT, PRE_LB, 0, "ls_out_pre_lb") \
106 PIPELINE_STAGE(SWITCH, OUT, PRE_ACL, 1, "ls_out_pre_acl") \
107 PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful") \
108 PIPELINE_STAGE(SWITCH, OUT, LB, 3, "ls_out_lb") \
109 PIPELINE_STAGE(SWITCH, OUT, ACL, 4, "ls_out_acl") \
110 PIPELINE_STAGE(SWITCH, OUT, STATEFUL, 5, "ls_out_stateful") \
111 PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_IP, 6, "ls_out_port_sec_ip") \
112 PIPELINE_STAGE(SWITCH, OUT, PORT_SEC_L2, 7, "ls_out_port_sec_l2") \
114 /* Logical router ingress stages. */ \
115 PIPELINE_STAGE(ROUTER, IN, ADMISSION, 0, "lr_in_admission") \
116 PIPELINE_STAGE(ROUTER, IN, IP_INPUT, 1, "lr_in_ip_input") \
117 PIPELINE_STAGE(ROUTER, IN, UNSNAT, 2, "lr_in_unsnat") \
118 PIPELINE_STAGE(ROUTER, IN, DNAT, 3, "lr_in_dnat") \
119 PIPELINE_STAGE(ROUTER, IN, IP_ROUTING, 4, "lr_in_ip_routing") \
120 PIPELINE_STAGE(ROUTER, IN, ARP_RESOLVE, 5, "lr_in_arp_resolve") \
121 PIPELINE_STAGE(ROUTER, IN, ARP_REQUEST, 6, "lr_in_arp_request") \
123 /* Logical router egress stages. */ \
124 PIPELINE_STAGE(ROUTER, OUT, SNAT, 0, "lr_out_snat") \
125 PIPELINE_STAGE(ROUTER, OUT, DELIVERY, 1, "lr_out_delivery")
/* Expand PIPELINE_STAGES into S_<DP>_<PIPELINE>_<STAGE> enum constants.
 * (NOTE(review): the surrounding "enum ovn_stage { ... };" lines are
 * elided in this excerpt.) */
127 #define PIPELINE_STAGE(DP_TYPE, PIPELINE, STAGE, TABLE, NAME) \
128 S_##DP_TYPE##_##PIPELINE##_##STAGE \
129 = OVN_STAGE_BUILD(DP_##DP_TYPE, P_##PIPELINE, TABLE),
131 #undef PIPELINE_STAGE
134 /* Due to various hard-coded priorities need to implement ACLs, the
135 * northbound database supports a smaller range of ACL priorities than
136 * are available to logical flows. This value is added to an ACL
137 * priority to determine the ACL's logical flow priority. */
138 #define OVN_ACL_PRI_OFFSET 1000
/* Symbolic names for the conntrack-related bits of logical register reg0. */
140 #define REGBIT_CONNTRACK_DEFRAG "reg0[0]"
141 #define REGBIT_CONNTRACK_COMMIT "reg0[1]"
142 #define REGBIT_CONNTRACK_NAT "reg0[2]"
144 /* Returns an "enum ovn_stage" built from the arguments. */
145 static enum ovn_stage
146 ovn_stage_build(enum ovn_datapath_type dp_type, enum ovn_pipeline pipeline,
/* Type-safe wrapper around OVN_STAGE_BUILD (some lines elided here). */
149 return OVN_STAGE_BUILD(dp_type, pipeline, table);
152 /* Returns the pipeline to which 'stage' belongs. */
153 static enum ovn_pipeline
154 ovn_stage_get_pipeline(enum ovn_stage stage)
/* Bit 8 of the stage encoding is the pipeline (see OVN_STAGE_BUILD). */
156 return (stage >> 8) & 1;
159 /* Returns the table to which 'stage' belongs. */
/* NOTE(review): return type and body are elided in this excerpt —
 * presumably it extracts the low-order table bits; confirm upstream. */
161 ovn_stage_get_table(enum ovn_stage stage)
166 /* Returns a string name for 'stage'. */
168 ovn_stage_to_str(enum ovn_stage stage)
/* Expands PIPELINE_STAGES into one switch case per stage. */
171 #define PIPELINE_STAGE(DP_TYPE, PIPELINE, STAGE, TABLE, NAME) \
172 case S_##DP_TYPE##_##PIPELINE##_##STAGE: return NAME;
174 #undef PIPELINE_STAGE
175 default: return "<unknown>";
179 /* Returns the type of the datapath to which a flow with the given 'stage' may
181 static enum ovn_datapath_type
182 ovn_stage_to_datapath_type(enum ovn_stage stage)
/* Same x-macro trick as ovn_stage_to_str(); unknown stages are a bug. */
185 #define PIPELINE_STAGE(DP_TYPE, PIPELINE, STAGE, TABLE, NAME) \
186 case S_##DP_TYPE##_##PIPELINE##_##STAGE: return DP_##DP_TYPE;
188 #undef PIPELINE_STAGE
189 default: OVS_NOT_REACHED();
197 %s: OVN northbound management daemon\n\
198 usage: %s [OPTIONS]\n\
201 --ovnnb-db=DATABASE connect to ovn-nb database at DATABASE\n\
203 --ovnsb-db=DATABASE connect to ovn-sb database at DATABASE\n\
205 -h, --help display this help message\n\
206 -o, --options list available options\n\
207 -V, --version display version information\n\
208 ", program_name, program_name, default_nb_db(), default_sb_db());
/* Also document the stream/database connection option syntax. */
211 stream_usage("database", true, true, false);
/* Node in a "tnlid set": a hash set of in-use tunnel IDs, hashed on the ID
 * value.  (NOTE(review): the struct tnlid_node declaration is only partly
 * visible in this excerpt.) */
215 struct hmap_node hmap_node;
/* Frees every node in 'tnlids' and destroys the map itself. */
220 destroy_tnlids(struct hmap *tnlids)
222 struct tnlid_node *node;
223 HMAP_FOR_EACH_POP (node, hmap_node, tnlids) {
226 hmap_destroy(tnlids);
/* Records 'tnlid' as in use in 'set'.  Does not check for duplicates. */
230 add_tnlid(struct hmap *set, uint32_t tnlid)
232 struct tnlid_node *node = xmalloc(sizeof *node);
233 hmap_insert(set, &node->hmap_node, hash_int(tnlid, 0));
/* Returns whether 'tnlid' is already present in 'set'. */
238 tnlid_in_use(const struct hmap *set, uint32_t tnlid)
240 const struct tnlid_node *node;
241 HMAP_FOR_EACH_IN_BUCKET (node, hmap_node, hash_int(tnlid, 0), set) {
242 if (node->tnlid == tnlid) {
/* Allocates and records a tunnel ID in [1, max] that is not yet in 'set',
 * searching circularly starting just after '*hint'.  Logs a rate-limited
 * warning when the space is exhausted (the failure return value is elided
 * in this excerpt — presumably 0). */
250 allocate_tnlid(struct hmap *set, const char *name, uint32_t max,
253 for (uint32_t tnlid = *hint + 1; tnlid != *hint;
254 tnlid = tnlid + 1 <= max ? tnlid + 1 : 1) {
255 if (!tnlid_in_use(set, tnlid)) {
256 add_tnlid(set, tnlid);
262 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
263 VLOG_WARN_RL(&rl, "all %s tunnel ids exhausted", name);
267 /* The 'key' comes from nbs->header_.uuid or nbr->header_.uuid or
268 * sb->external_ids:logical-switch. */
269 struct ovn_datapath {
270 struct hmap_node key_node; /* Index on 'key'. */
271 struct uuid key; /* (nbs/nbr)->header_.uuid. */
/* At most one of nbs/nbr is nonnull: a datapath is either a logical
 * switch or a logical router. */
273 const struct nbrec_logical_switch *nbs; /* May be NULL. */
274 const struct nbrec_logical_router *nbr; /* May be NULL. */
275 const struct sbrec_datapath_binding *sb; /* May be NULL. */
277 struct ovs_list list; /* In list of similar records. */
279 /* Logical switch data. */
280 struct ovn_port **router_ports;
281 size_t n_router_ports;
/* In-use port tunnel keys on this datapath, plus a search hint for
 * allocating the next one (see allocate_tnlid()). */
283 struct hmap port_tnlids;
284 uint32_t port_key_hint;
/* Creates a new ovn_datapath with the given 'key' and NB/SB rows (any may
 * be NULL), indexes it in 'datapaths' by UUID hash, and returns it.
 * (NOTE(review): some field-assignment lines are elided in this excerpt.) */
289 static struct ovn_datapath *
290 ovn_datapath_create(struct hmap *datapaths, const struct uuid *key,
291 const struct nbrec_logical_switch *nbs,
292 const struct nbrec_logical_router *nbr,
293 const struct sbrec_datapath_binding *sb)
295 struct ovn_datapath *od = xzalloc(sizeof *od);
300 hmap_init(&od->port_tnlids);
301 od->port_key_hint = 0;
302 hmap_insert(datapaths, &od->key_node, uuid_hash(&od->key));
/* Removes 'od' from 'datapaths' and frees the storage it owns. */
307 ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
310 /* Don't remove od->list. It is used within build_datapaths() as a
311 * private list and once we've exited that function it is not safe to
313 hmap_remove(datapaths, &od->key_node);
314 destroy_tnlids(&od->port_tnlids);
315 free(od->router_ports);
320 /* Returns 'od''s datapath type. */
321 static enum ovn_datapath_type
322 ovn_datapath_get_type(const struct ovn_datapath *od)
324 return od->nbs ? DP_SWITCH : DP_ROUTER;
/* Looks up the datapath keyed by 'uuid' in 'datapaths'; NULL if absent
 * (return statements elided in this excerpt). */
327 static struct ovn_datapath *
328 ovn_datapath_find(struct hmap *datapaths, const struct uuid *uuid)
330 struct ovn_datapath *od;
332 HMAP_FOR_EACH_WITH_HASH (od, key_node, uuid_hash(uuid), datapaths) {
333 if (uuid_equals(uuid, &od->key)) {
/* Maps a SB Datapath_Binding row back to its ovn_datapath via the
 * external-ids:logical-switch or external-ids:logical-router UUID. */
340 static struct ovn_datapath *
341 ovn_datapath_from_sbrec(struct hmap *datapaths,
342 const struct sbrec_datapath_binding *sb)
346 if (!smap_get_uuid(&sb->external_ids, "logical-switch", &key) &&
347 !smap_get_uuid(&sb->external_ids, "logical-router", &key)) {
350 return ovn_datapath_find(datapaths, &key);
/* A router is enabled unless its NB 'enabled' column is set and false. */
354 lrouter_is_enabled(const struct nbrec_logical_router *lrouter)
356 return !lrouter->enabled || *lrouter->enabled;
/* Partitions all datapaths into three lists: 'sb_only' (SB rows without a
 * NB match, to be deleted), 'nb_only' (NB switches/routers without a SB
 * row, to be created), and 'both'.  Also populates 'datapaths' with an
 * ovn_datapath for each one. */
360 join_datapaths(struct northd_context *ctx, struct hmap *datapaths,
361 struct ovs_list *sb_only, struct ovs_list *nb_only,
362 struct ovs_list *both)
364 hmap_init(datapaths);
365 ovs_list_init(sb_only);
366 ovs_list_init(nb_only);
/* Pass 1: index every SB Datapath_Binding; delete rows whose NB UUID is
 * missing or duplicated. */
369 const struct sbrec_datapath_binding *sb, *sb_next;
370 SBREC_DATAPATH_BINDING_FOR_EACH_SAFE (sb, sb_next, ctx->ovnsb_idl) {
372 if (!smap_get_uuid(&sb->external_ids, "logical-switch", &key) &&
373 !smap_get_uuid(&sb->external_ids, "logical-router", &key)) {
374 ovsdb_idl_txn_add_comment(
376 "deleting Datapath_Binding "UUID_FMT" that lacks "
377 "external-ids:logical-switch and "
378 "external-ids:logical-router",
379 UUID_ARGS(&sb->header_.uuid));
380 sbrec_datapath_binding_delete(sb);
384 if (ovn_datapath_find(datapaths, &key)) {
385 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
387 &rl, "deleting Datapath_Binding "UUID_FMT" with "
388 "duplicate external-ids:logical-switch/router "UUID_FMT,
389 UUID_ARGS(&sb->header_.uuid), UUID_ARGS(&key));
390 sbrec_datapath_binding_delete(sb);
394 struct ovn_datapath *od = ovn_datapath_create(datapaths, &key,
396 ovs_list_push_back(sb_only, &od->list);
/* Pass 2: match NB logical switches against the SB-derived entries;
 * matched entries move from sb_only to 'both'. */
399 const struct nbrec_logical_switch *nbs;
400 NBREC_LOGICAL_SWITCH_FOR_EACH (nbs, ctx->ovnnb_idl) {
401 struct ovn_datapath *od = ovn_datapath_find(datapaths,
405 ovs_list_remove(&od->list);
406 ovs_list_push_back(both, &od->list);
408 od = ovn_datapath_create(datapaths, &nbs->header_.uuid,
410 ovs_list_push_back(nb_only, &od->list);
/* Pass 3: same for NB logical routers; disabled routers are skipped
 * entirely, and a UUID colliding with a switch is reported. */
414 const struct nbrec_logical_router *nbr;
415 NBREC_LOGICAL_ROUTER_FOR_EACH (nbr, ctx->ovnnb_idl) {
416 if (!lrouter_is_enabled(nbr)) {
420 struct ovn_datapath *od = ovn_datapath_find(datapaths,
425 ovs_list_remove(&od->list);
426 ovs_list_push_back(both, &od->list);
429 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
431 "duplicate UUID "UUID_FMT" in OVN_Northbound",
432 UUID_ARGS(&nbr->header_.uuid));
436 od = ovn_datapath_create(datapaths, &nbr->header_.uuid,
438 ovs_list_push_back(nb_only, &od->list);
/* Allocates an unused datapath tunnel key from the 24-bit key space. */
444 ovn_datapath_allocate_key(struct hmap *dp_tnlids)
446 static uint32_t hint;
447 return allocate_tnlid(dp_tnlids, "datapath", (1u << 24) - 1, &hint);
450 /* Updates the southbound Datapath_Binding table so that it contains the
451 * logical switches and routers specified by the northbound database.
453 * Initializes 'datapaths' to contain a "struct ovn_datapath" for every logical
454 * switch and router. */
456 build_datapaths(struct northd_context *ctx, struct hmap *datapaths)
458 struct ovs_list sb_only, nb_only, both;
460 join_datapaths(ctx, datapaths, &sb_only, &nb_only, &both);
462 if (!ovs_list_is_empty(&nb_only)) {
463 /* First index the in-use datapath tunnel IDs. */
464 struct hmap dp_tnlids = HMAP_INITIALIZER(&dp_tnlids);
465 struct ovn_datapath *od;
466 LIST_FOR_EACH (od, list, &both) {
467 add_tnlid(&dp_tnlids, od->sb->tunnel_key);
470 /* Add southbound record for each unmatched northbound record. */
471 LIST_FOR_EACH (od, list, &nb_only) {
/* NOTE(review): ovn_datapath_allocate_key() draws from a 24-bit
 * space (see above) but is stored here in a uint16_t, which would
 * truncate keys above 65535 — confirm against the original source. */
472 uint16_t tunnel_key = ovn_datapath_allocate_key(&dp_tnlids);
477 od->sb = sbrec_datapath_binding_insert(ctx->ovnsb_txn);
/* Record the NB UUID in external-ids so join_datapaths() can match
 * this SB row on the next iteration. */
479 char uuid_s[UUID_LEN + 1];
480 sprintf(uuid_s, UUID_FMT, UUID_ARGS(&od->key));
481 const char *key = od->nbs ? "logical-switch" : "logical-router";
482 const struct smap id = SMAP_CONST1(&id, key, uuid_s);
483 sbrec_datapath_binding_set_external_ids(od->sb, &id);
485 sbrec_datapath_binding_set_tunnel_key(od->sb, tunnel_key);
487 destroy_tnlids(&dp_tnlids);
490 /* Delete southbound records without northbound matches. */
491 struct ovn_datapath *od, *next;
492 LIST_FOR_EACH_SAFE (od, next, list, &sb_only) {
493 ovs_list_remove(&od->list);
494 sbrec_datapath_binding_delete(od->sb);
495 ovn_datapath_destroy(datapaths, od);
/* A logical switch port or logical router port.
 * (NOTE(review): the "struct ovn_port {" opener is elided in this
 * excerpt.) */
500 struct hmap_node key_node; /* Index on 'key'. */
501 char *key; /* nbs->name, nbr->name, sb->logical_port. */
502 char *json_key; /* 'key', quoted for use in JSON. */
504 const struct sbrec_port_binding *sb; /* May be NULL. */
506 /* Logical switch port data. */
507 const struct nbrec_logical_switch_port *nbs; /* May be NULL. */
509 struct lport_addresses *lsp_addrs; /* Logical switch port addresses. */
510 unsigned int n_lsp_addrs;
512 struct lport_addresses *ps_addrs; /* Port security addresses. */
513 unsigned int n_ps_addrs;
515 /* Logical router port data. */
516 const struct nbrec_logical_router_port *nbr; /* May be NULL. */
518 struct lport_addresses lrp_networks;
/* The port on the other side of a switch<->router connection; set in
 * join_logical_ports(). */
520 struct ovn_port *peer;
/* The datapath this port belongs to. */
522 struct ovn_datapath *od;
524 struct ovs_list list; /* In list of similar records. */
/* Creates an ovn_port named 'key' with the given NB/SB rows (any may be
 * NULL), indexes it in 'ports' by name hash, and returns it.
 * (NOTE(review): some field-assignment lines are elided in this excerpt.) */
527 static struct ovn_port *
528 ovn_port_create(struct hmap *ports, const char *key,
529 const struct nbrec_logical_switch_port *nbs,
530 const struct nbrec_logical_router_port *nbr,
531 const struct sbrec_port_binding *sb)
533 struct ovn_port *op = xzalloc(sizeof *op);
/* Pre-compute the JSON-quoted form of the name for use in match
 * expressions (e.g. 'inport == "name"'). */
535 struct ds json_key = DS_EMPTY_INITIALIZER;
536 json_string_escape(key, &json_key);
537 op->json_key = ds_steal_cstr(&json_key);
539 op->key = xstrdup(key);
543 hmap_insert(ports, &op->key_node, hash_string(op->key, 0));
/* Removes 'port' from 'ports' and frees everything it owns. */
548 ovn_port_destroy(struct hmap *ports, struct ovn_port *port)
551 /* Don't remove port->list. It is used within build_ports() as a
552 * private list and once we've exited that function it is not safe to
554 hmap_remove(ports, &port->key_node);
556 for (int i = 0; i < port->n_lsp_addrs; i++) {
557 destroy_lport_addresses(&port->lsp_addrs[i]);
559 free(port->lsp_addrs);
561 for (int i = 0; i < port->n_ps_addrs; i++) {
562 destroy_lport_addresses(&port->ps_addrs[i]);
564 free(port->ps_addrs);
566 destroy_lport_addresses(&port->lrp_networks);
567 free(port->json_key);
/* Looks up a port by name in 'ports'; NULL if not found (return
 * statements elided in this excerpt). */
573 static struct ovn_port *
574 ovn_port_find(struct hmap *ports, const char *name)
578 HMAP_FOR_EACH_WITH_HASH (op, key_node, hash_string(name, 0), ports) {
579 if (!strcmp(op->key, name)) {
/* Allocates an unused 15-bit port tunnel key on datapath 'od'. */
587 ovn_port_allocate_key(struct ovn_datapath *od)
589 return allocate_tnlid(&od->port_tnlids, "port",
590 (1u << 15) - 1, &od->port_key_hint);
/* Partitions all logical ports into 'sb_only' (SB Port_Binding rows with
 * no NB match), 'nb_only' (NB switch/router ports with no SB row), and
 * 'both'.  Also populates 'ports', parses each port's addresses, and
 * wires up switch<->router peer pointers. */
594 join_logical_ports(struct northd_context *ctx,
595 struct hmap *datapaths, struct hmap *ports,
596 struct ovs_list *sb_only, struct ovs_list *nb_only,
597 struct ovs_list *both)
600 ovs_list_init(sb_only);
601 ovs_list_init(nb_only);
/* Pass 1: index every SB Port_Binding by its logical_port name. */
604 const struct sbrec_port_binding *sb;
605 SBREC_PORT_BINDING_FOR_EACH (sb, ctx->ovnsb_idl) {
606 struct ovn_port *op = ovn_port_create(ports, sb->logical_port,
608 ovs_list_push_back(sb_only, &op->list);
/* Pass 2: walk every datapath's NB ports, matching them against the
 * SB-derived entries and parsing address/port-security strings. */
611 struct ovn_datapath *od;
612 HMAP_FOR_EACH (od, key_node, datapaths) {
614 for (size_t i = 0; i < od->nbs->n_ports; i++) {
615 const struct nbrec_logical_switch_port *nbs = od->nbs->ports[i];
616 struct ovn_port *op = ovn_port_find(ports, nbs->name);
618 if (op->nbs || op->nbr) {
619 static struct vlog_rate_limit rl
620 = VLOG_RATE_LIMIT_INIT(5, 1);
621 VLOG_WARN_RL(&rl, "duplicate logical port %s",
626 ovs_list_remove(&op->list);
627 ovs_list_push_back(both, &op->list);
629 /* This port exists due to a SB binding, but should
630 * not have been initialized fully. */
631 ovs_assert(!op->n_lsp_addrs && !op->n_ps_addrs);
633 op = ovn_port_create(ports, nbs->name, nbs, NULL, NULL);
634 ovs_list_push_back(nb_only, &op->list);
/* Parse the NB "addresses" column; "unknown" entries are skipped
 * and malformed entries logged (rate-limited) but not fatal. */
638 = xmalloc(sizeof *op->lsp_addrs * nbs->n_addresses);
639 for (size_t j = 0; j < nbs->n_addresses; j++) {
640 if (!strcmp(nbs->addresses[j], "unknown")) {
643 if (!extract_lsp_addresses(nbs->addresses[j],
644 &op->lsp_addrs[op->n_lsp_addrs])) {
645 static struct vlog_rate_limit rl
646 = VLOG_RATE_LIMIT_INIT(1, 1);
647 VLOG_INFO_RL(&rl, "invalid syntax '%s' in logical "
648 "switch port addresses. No MAC "
650 op->nbs->addresses[j]);
/* Likewise for the "port_security" column. */
657 = xmalloc(sizeof *op->ps_addrs * nbs->n_port_security);
658 for (size_t j = 0; j < nbs->n_port_security; j++) {
659 if (!extract_lsp_addresses(nbs->port_security[j],
660 &op->ps_addrs[op->n_ps_addrs])) {
661 static struct vlog_rate_limit rl
662 = VLOG_RATE_LIMIT_INIT(1, 1);
663 VLOG_INFO_RL(&rl, "invalid syntax '%s' in port "
664 "security. No MAC address found",
665 op->nbs->port_security[j]);
/* Router datapaths: validate each NB router port's MAC/networks
 * before admitting it. */
674 for (size_t i = 0; i < od->nbr->n_ports; i++) {
675 const struct nbrec_logical_router_port *nbr = od->nbr->ports[i];
677 struct lport_addresses lrp_networks;
678 if (!extract_lrp_networks(nbr, &lrp_networks)) {
679 static struct vlog_rate_limit rl
680 = VLOG_RATE_LIMIT_INIT(5, 1);
681 VLOG_WARN_RL(&rl, "bad 'mac' %s", nbr->mac);
685 if (!lrp_networks.n_ipv4_addrs && !lrp_networks.n_ipv6_addrs) {
689 struct ovn_port *op = ovn_port_find(ports, nbr->name);
691 if (op->nbs || op->nbr) {
692 static struct vlog_rate_limit rl
693 = VLOG_RATE_LIMIT_INIT(5, 1);
694 VLOG_WARN_RL(&rl, "duplicate logical router port %s",
699 ovs_list_remove(&op->list);
700 ovs_list_push_back(both, &op->list);
702 /* This port exists but should not have been
703 * initialized fully. */
704 ovs_assert(!op->lrp_networks.n_ipv4_addrs
705 && !op->lrp_networks.n_ipv6_addrs);
707 op = ovn_port_create(ports, nbr->name, NULL, nbr, NULL);
708 ovs_list_push_back(nb_only, &op->list);
711 op->lrp_networks = lrp_networks;
717 /* Connect logical router ports, and logical switch ports of type "router",
720 HMAP_FOR_EACH (op, key_node, ports) {
721 if (op->nbs && !strcmp(op->nbs->type, "router")) {
722 const char *peer_name = smap_get(&op->nbs->options, "router-port");
727 struct ovn_port *peer = ovn_port_find(ports, peer_name);
728 if (!peer || !peer->nbr) {
/* Also record this switch port on its datapath's list of
 * router attachment points. */
734 op->od->router_ports = xrealloc(
735 op->od->router_ports,
736 sizeof *op->od->router_ports * (op->od->n_router_ports + 1));
737 op->od->router_ports[op->od->n_router_ports++] = op;
738 } else if (op->nbr && op->nbr->peer) {
739 op->peer = ovn_port_find(ports, op->nbr->peer);
/* Syncs 'op''s SB Port_Binding columns from its NB data: router ports
 * become type "patch" (or "gateway" when the router is pinned to a
 * chassis), switch ports of type "router" likewise, and all other switch
 * ports copy their NB type, options, parent, tag, and addresses. */
745 ovn_port_update_sbrec(const struct ovn_port *op)
747 sbrec_port_binding_set_datapath(op->sb, op->od->sb);
749 /* If the router is for l3 gateway, it resides on a chassis
750 * and its port type is "gateway". */
751 const char *chassis = smap_get(&op->od->nbr->options, "chassis");
753 sbrec_port_binding_set_type(op->sb, "gateway");
755 sbrec_port_binding_set_type(op->sb, "patch");
/* "<error>" marks a router port whose peer could not be resolved. */
758 const char *peer = op->peer ? op->peer->key : "<error>";
761 smap_add(&new, "peer", peer);
763 smap_add(&new, "gateway-chassis", chassis);
765 sbrec_port_binding_set_options(op->sb, &new);
/* Router ports never have a parent port, tag, or MAC column. */
768 sbrec_port_binding_set_parent_port(op->sb, NULL);
769 sbrec_port_binding_set_tag(op->sb, NULL, 0);
770 sbrec_port_binding_set_mac(op->sb, NULL, 0);
/* Non-"router" switch ports: copy NB columns straight through. */
772 if (strcmp(op->nbs->type, "router")) {
773 sbrec_port_binding_set_type(op->sb, op->nbs->type);
774 sbrec_port_binding_set_options(op->sb, &op->nbs->options);
/* "router"-type switch ports: mirror the peer router's gateway
 * status. */
776 const char *chassis = NULL;
777 if (op->peer && op->peer->od && op->peer->od->nbr) {
778 chassis = smap_get(&op->peer->od->nbr->options, "chassis");
781 /* A switch port connected to a gateway router is also of
784 sbrec_port_binding_set_type(op->sb, "gateway");
786 sbrec_port_binding_set_type(op->sb, "patch");
789 const char *router_port = smap_get(&op->nbs->options,
792 router_port = "<error>";
796 smap_add(&new, "peer", router_port);
798 smap_add(&new, "gateway-chassis", chassis);
800 sbrec_port_binding_set_options(op->sb, &new);
803 sbrec_port_binding_set_parent_port(op->sb, op->nbs->parent_name);
804 sbrec_port_binding_set_tag(op->sb, op->nbs->tag, op->nbs->n_tag);
805 sbrec_port_binding_set_mac(op->sb, (const char **) op->nbs->addresses,
806 op->nbs->n_addresses);
810 /* Updates the southbound Port_Binding table so that it contains the logical
811 * switch ports specified by the northbound database.
813 * Initializes 'ports' to contain a "struct ovn_port" for every logical port,
814 * using the "struct ovn_datapath"s in 'datapaths' to look up logical
817 build_ports(struct northd_context *ctx, struct hmap *datapaths,
820 struct ovs_list sb_only, nb_only, both;
822 join_logical_ports(ctx, datapaths, ports, &sb_only, &nb_only, &both);
824 /* For logical ports that are in both databases, update the southbound
825 * record based on northbound data. Also index the in-use tunnel_keys. */
826 struct ovn_port *op, *next;
827 LIST_FOR_EACH_SAFE (op, next, list, &both) {
828 ovn_port_update_sbrec(op);
/* Track the highest in-use key as the allocator's search hint. */
830 add_tnlid(&op->od->port_tnlids, op->sb->tunnel_key);
831 if (op->sb->tunnel_key > op->od->port_key_hint) {
832 op->od->port_key_hint = op->sb->tunnel_key;
836 /* Add southbound record for each unmatched northbound record. */
837 LIST_FOR_EACH_SAFE (op, next, list, &nb_only) {
838 uint16_t tunnel_key = ovn_port_allocate_key(op->od);
843 op->sb = sbrec_port_binding_insert(ctx->ovnsb_txn);
844 ovn_port_update_sbrec(op);
846 sbrec_port_binding_set_logical_port(op->sb, op->key);
847 sbrec_port_binding_set_tunnel_key(op->sb, tunnel_key);
850 /* Delete southbound records without northbound matches. */
851 LIST_FOR_EACH_SAFE(op, next, list, &sb_only) {
852 ovs_list_remove(&op->list);
853 sbrec_port_binding_delete(op->sb);
854 ovn_port_destroy(ports, op);
/* Multicast group tunnel keys occupy the upper half of the 16-bit port
 * key space, so they never collide with unicast port keys. */
858 #define OVN_MIN_MULTICAST 32768
859 #define OVN_MAX_MULTICAST 65535
861 struct multicast_group {
863 uint16_t key; /* OVN_MIN_MULTICAST...OVN_MAX_MULTICAST. */
/* Built-in groups: flood-to-all and flood-to-unknown-MAC ports. */
866 #define MC_FLOOD "_MC_flood"
867 static const struct multicast_group mc_flood = { MC_FLOOD, 65535 };
869 #define MC_UNKNOWN "_MC_unknown"
870 static const struct multicast_group mc_unknown = { MC_UNKNOWN, 65534 };
/* Two groups are equal when both name and key match. */
873 multicast_group_equal(const struct multicast_group *a,
874 const struct multicast_group *b)
876 return !strcmp(a->name, b->name) && a->key == b->key;
879 /* Multicast group entry. */
880 struct ovn_multicast {
881 struct hmap_node hmap_node; /* Index on 'datapath' and 'key'. */
882 struct ovn_datapath *datapath;
883 const struct multicast_group *group;
/* Member ports, grown on demand (see ovn_multicast_add()). */
885 struct ovn_port **ports;
886 size_t n_ports, allocated_ports;
/* Hash on the datapath pointer, seeded with the group key. */
890 ovn_multicast_hash(const struct ovn_datapath *datapath,
891 const struct multicast_group *group)
893 return hash_pointer(datapath, group->key);
/* Finds the (datapath, group) entry in 'mcgroups'; NULL if absent
 * (return statements elided in this excerpt). */
896 static struct ovn_multicast *
897 ovn_multicast_find(struct hmap *mcgroups, struct ovn_datapath *datapath,
898 const struct multicast_group *group)
900 struct ovn_multicast *mc;
902 HMAP_FOR_EACH_WITH_HASH (mc, hmap_node,
903 ovn_multicast_hash(datapath, group), mcgroups) {
904 if (mc->datapath == datapath
905 && multicast_group_equal(mc->group, group)) {
/* Adds 'port' to the (port->od, group) entry in 'mcgroups', creating the
 * entry (initial capacity 4, doubled as needed) if necessary. */
913 ovn_multicast_add(struct hmap *mcgroups, const struct multicast_group *group,
914 struct ovn_port *port)
916 struct ovn_datapath *od = port->od;
917 struct ovn_multicast *mc = ovn_multicast_find(mcgroups, od, group);
919 mc = xmalloc(sizeof *mc);
920 hmap_insert(mcgroups, &mc->hmap_node, ovn_multicast_hash(od, group));
924 mc->allocated_ports = 4;
925 mc->ports = xmalloc(mc->allocated_ports * sizeof *mc->ports);
927 if (mc->n_ports >= mc->allocated_ports) {
928 mc->ports = x2nrealloc(mc->ports, &mc->allocated_ports,
931 mc->ports[mc->n_ports++] = port;
/* Removes 'mc' from 'mcgroups' and frees it. */
935 ovn_multicast_destroy(struct hmap *mcgroups, struct ovn_multicast *mc)
938 hmap_remove(mcgroups, &mc->hmap_node);
/* Writes 'mc''s member ports into the SB Multicast_Group row 'sb'. */
945 ovn_multicast_update_sbrec(const struct ovn_multicast *mc,
946 const struct sbrec_multicast_group *sb)
948 struct sbrec_port_binding **ports = xmalloc(mc->n_ports * sizeof *ports);
949 for (size_t i = 0; i < mc->n_ports; i++) {
950 ports[i] = CONST_CAST(struct sbrec_port_binding *, mc->ports[i]->sb);
952 sbrec_multicast_group_set_ports(sb, ports, mc->n_ports);
956 /* Logical flow generation.
958 * This code generates the Logical_Flow table in the southbound database, as a
959 * function of most of the northbound database.
/* A pending logical flow, hashed on (datapath, stage, priority, match,
 * actions).  (NOTE(review): the struct ovn_lflow declaration is only
 * partly visible in this excerpt.) */
963 struct hmap_node hmap_node;
965 struct ovn_datapath *od;
966 enum ovn_stage stage;
/* Hash that mixes every field compared by ovn_lflow_equal(). */
973 ovn_lflow_hash(const struct ovn_lflow *lflow)
975 size_t hash = uuid_hash(&lflow->od->key);
976 hash = hash_2words((lflow->stage << 16) | lflow->priority, hash);
977 hash = hash_string(lflow->match, hash);
978 return hash_string(lflow->actions, hash);
/* Full equality on (datapath, stage, priority, match, actions). */
982 ovn_lflow_equal(const struct ovn_lflow *a, const struct ovn_lflow *b)
984 return (a->od == b->od
985 && a->stage == b->stage
986 && a->priority == b->priority
987 && !strcmp(a->match, b->match)
988 && !strcmp(a->actions, b->actions));
/* Fills in 'lflow'; takes ownership of the 'match'/'actions' strings. */
992 ovn_lflow_init(struct ovn_lflow *lflow, struct ovn_datapath *od,
993 enum ovn_stage stage, uint16_t priority,
994 char *match, char *actions)
997 lflow->stage = stage;
998 lflow->priority = priority;
999 lflow->match = match;
1000 lflow->actions = actions;
1003 /* Adds a row with the specified contents to the Logical_Flow table. */
1005 ovn_lflow_add(struct hmap *lflow_map, struct ovn_datapath *od,
1006 enum ovn_stage stage, uint16_t priority,
1007 const char *match, const char *actions)
/* A switch stage must only ever be attached to a switch datapath, and
 * likewise for routers. */
1009 ovs_assert(ovn_stage_to_datapath_type(stage) == ovn_datapath_get_type(od));
1011 struct ovn_lflow *lflow = xmalloc(sizeof *lflow);
1012 ovn_lflow_init(lflow, od, stage, priority,
1013 xstrdup(match), xstrdup(actions));
1014 hmap_insert(lflow_map, &lflow->hmap_node, ovn_lflow_hash(lflow));
/* Finds an existing flow identical to the given fields; uses a stack
 * 'target' with borrowed (CONST_CAST) strings purely as a lookup key. */
1017 static struct ovn_lflow *
1018 ovn_lflow_find(struct hmap *lflows, struct ovn_datapath *od,
1019 enum ovn_stage stage, uint16_t priority,
1020 const char *match, const char *actions)
1022 struct ovn_lflow target;
1023 ovn_lflow_init(&target, od, stage, priority,
1024 CONST_CAST(char *, match), CONST_CAST(char *, actions));
1026 struct ovn_lflow *lflow;
1027 HMAP_FOR_EACH_WITH_HASH (lflow, hmap_node, ovn_lflow_hash(&target),
1029 if (ovn_lflow_equal(lflow, &target)) {
/* Removes 'lflow' from 'lflows' and frees its owned strings. */
1037 ovn_lflow_destroy(struct hmap *lflows, struct ovn_lflow *lflow)
1040 hmap_remove(lflows, &lflow->hmap_node);
1042 free(lflow->actions);
1047 /* Appends port security constraints on L2 address field 'eth_addr_field'
1048 * (e.g. "eth.src" or "eth.dst") to 'match'. 'ps_addrs', with 'n_ps_addrs'
1049 * elements, is the collection of port_security constraints from an
1050 * OVN_NB Logical_Switch_Port row generated by extract_lsp_addresses(). */
1052 build_port_security_l2(const char *eth_addr_field,
1053 struct lport_addresses *ps_addrs,
1054 unsigned int n_ps_addrs,
/* Emits e.g. ' && eth.src == {aa:.. bb:..}'; the trailing space before
 * the brace is trimmed with ds_chomp(). */
1061 ds_put_format(match, " && %s == {", eth_addr_field);
1063 for (size_t i = 0; i < n_ps_addrs; i++) {
1064 ds_put_format(match, "%s ", ps_addrs[i].ea_s);
1066 ds_chomp(match, ' ');
1067 ds_put_cstr(match, "}");
/* Appends an IPv6 ND match to 'match' that allows ND packets whose
 * source/target link-layer option is all-zeros or the port's MAC 'ea',
 * and — when 'ipv6_addrs' is non-empty — whose nd.target is the port's
 * EUI-64 link-local address or one of the given addresses. */
1071 build_port_security_ipv6_nd_flow(
1072 struct ds *match, struct eth_addr ea, struct ipv6_netaddr *ipv6_addrs,
1075 ds_put_format(match, " && ip6 && nd && ((nd.sll == "ETH_ADDR_FMT" || "
1076 "nd.sll == "ETH_ADDR_FMT") || ((nd.tll == "ETH_ADDR_FMT" || "
1077 "nd.tll == "ETH_ADDR_FMT")", ETH_ADDR_ARGS(eth_addr_zero),
1078 ETH_ADDR_ARGS(ea), ETH_ADDR_ARGS(eth_addr_zero),
/* No IPv6 addresses: close the parenthesization and stop. */
1080 if (!n_ipv6_addrs) {
1081 ds_put_cstr(match, "))");
/* Always permit the link-local address derived from the MAC. */
1085 char ip6_str[INET6_ADDRSTRLEN + 1];
1086 struct in6_addr lla;
1087 in6_generate_lla(ea, &lla);
1088 memset(ip6_str, 0, sizeof(ip6_str));
1089 ipv6_string_mapped(ip6_str, &lla);
1090 ds_put_format(match, " && (nd.target == %s", ip6_str);
1092 for(int i = 0; i < n_ipv6_addrs; i++) {
1093 memset(ip6_str, 0, sizeof(ip6_str));
1094 ipv6_string_mapped(ip6_str, &ipv6_addrs[i].addr);
1095 ds_put_format(match, " || nd.target == %s", ip6_str);
1098 ds_put_format(match, ")))");
/* Appends ' && ip6.src == {...}' (ingress) or ' && ip6.dst == {...}'
 * (egress) to 'match', permitting the MAC-derived link-local address,
 * ff00::/8 multicast on egress, and each address in 'ipv6_addrs'. */
1102 build_port_security_ipv6_flow(
1103 enum ovn_pipeline pipeline, struct ds *match, struct eth_addr ea,
1104 struct ipv6_netaddr *ipv6_addrs, int n_ipv6_addrs)
1106 char ip6_str[INET6_ADDRSTRLEN + 1];
1108 ds_put_format(match, " && %s == {",
1109 pipeline == P_IN ? "ip6.src" : "ip6.dst");
1111 /* Allow link-local address. */
1112 struct in6_addr lla;
1113 in6_generate_lla(ea, &lla);
1114 ipv6_string_mapped(ip6_str, &lla);
1115 ds_put_format(match, "%s, ", ip6_str);
1117 /* Allow ip6.dst=ff00::/8 for multicast packets */
1118 if (pipeline == P_OUT) {
1119 ds_put_cstr(match, "ff00::/8, ");
1121 for(int i = 0; i < n_ipv6_addrs; i++) {
1122 ipv6_string_mapped(ip6_str, &ipv6_addrs[i].addr);
1123 ds_put_format(match, "%s, ", ip6_str);
1125 /* Replace ", " by "}". */
1126 ds_chomp(match, ' ');
1127 ds_chomp(match, ',');
1128 ds_put_cstr(match, "}");
1132 * Build port security constraints on ARP and IPv6 ND fields
1133 * and add logical flows to S_SWITCH_IN_PORT_SEC_ND stage.
1135 * For each port security of the logical port, following
1136 * logical flows are added
1137 * - If the port security has no IP (both IPv4 and IPv6) or
1138 * if it has IPv4 address(es)
1139 * - Priority 90 flow to allow ARP packets for known MAC addresses
1140 * in the eth.src and arp.spa fields. If the port security
1141 * has IPv4 addresses, allow known IPv4 addresses in the arp.tpa field.
1143 * - If the port security has no IP (both IPv4 and IPv6) or
1144 * if it has IPv6 address(es)
1145 * - Priority 90 flow to allow IPv6 ND packets for known MAC addresses
1146 * in the eth.src and nd.sll/nd.tll fields. If the port security
1147 * has IPv6 addresses, allow known IPv6 addresses in the nd.target field
1148 * for IPv6 Neighbor Advertisement packet.
1150 * - Priority 80 flow to drop ARP and IPv6 ND packets.
1153 build_port_security_nd(struct ovn_port *op, struct hmap *lflows)
1155 struct ds match = DS_EMPTY_INITIALIZER;
1157 for (size_t i = 0; i < op->n_ps_addrs; i++) {
1158 struct lport_addresses *ps = &op->ps_addrs[i];
1160 bool no_ip = !(ps->n_ipv4_addrs || ps->n_ipv6_addrs);
/* ARP allow flow (IPv4 or MAC-only port security). */
1163 if (ps->n_ipv4_addrs || no_ip) {
1164 ds_put_format(&match,
1165 "inport == %s && eth.src == %s && arp.sha == %s",
1166 op->json_key, ps->ea_s, ps->ea_s);
1168 if (ps->n_ipv4_addrs) {
1169 ds_put_cstr(&match, " && arp.spa == {");
1170 for (size_t j = 0; j < ps->n_ipv4_addrs; j++) {
1171 /* When the netmask is applied, if the host portion is
1172 * non-zero, the host can only use the specified
1173 * address in the arp.spa. If zero, the host is allowed
1174 * to use any address in the subnet. */
1175 if (ps->ipv4_addrs[j].plen == 32
1176 || ps->ipv4_addrs[j].addr & ~ps->ipv4_addrs[j].mask) {
1177 ds_put_cstr(&match, ps->ipv4_addrs[j].addr_s);
1179 ds_put_format(&match, "%s/%d",
1180 ps->ipv4_addrs[j].network_s,
1181 ps->ipv4_addrs[j].plen);
1183 ds_put_cstr(&match, ", ");
1185 ds_chomp(&match, ' ');
1186 ds_chomp(&match, ',');
1187 ds_put_cstr(&match, "}");
1189 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_PORT_SEC_ND, 90,
1190 ds_cstr(&match), "next;");
/* IPv6 ND allow flow (IPv6 or MAC-only port security). */
1193 if (ps->n_ipv6_addrs || no_ip) {
1195 ds_put_format(&match, "inport == %s && eth.src == %s",
1196 op->json_key, ps->ea_s);
1197 build_port_security_ipv6_nd_flow(&match, ps->ea, ps->ipv6_addrs,
1199 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_PORT_SEC_ND, 90,
1200 ds_cstr(&match), "next;");
/* Catch-all: drop any ARP/ND from this port not allowed above. */
1205 ds_put_format(&match, "inport == %s && (arp || nd)", op->json_key);
1206 ovn_lflow_add(lflows, op->od, S_SWITCH_IN_PORT_SEC_ND, 80,
1207 ds_cstr(&match), "drop;");
1212 * Build port security constraints on IPv4 and IPv6 src and dst fields
1213 * and add logical flows to S_SWITCH_(IN/OUT)_PORT_SEC_IP stage.
1215 * For each port security of the logical port, following
1216 * logical flows are added
1217 * - If the port security has IPv4 addresses,
1218 * - Priority 90 flow to allow IPv4 packets for known IPv4 addresses
1220 * - If the port security has IPv6 addresses,
1221 * - Priority 90 flow to allow IPv6 packets for known IPv6 addresses
1223 * - If the port security has IPv4 addresses or IPv6 addresses or both
1224 * - Priority 80 flow to drop all IPv4 and IPv6 traffic
/* Builds IPv4/IPv6 port-security flows for logical switch port 'op' in the
 * given 'pipeline' (P_IN matches eth.src/ip?.src on "inport", P_OUT matches
 * eth.dst/ip?.dst on "outport") and adds them to the corresponding
 * S_SWITCH_{IN,OUT}_PORT_SEC_IP stage:
 *
 *   - Priority 90: permit traffic for the configured IPv4/IPv6 addresses
 *     (plus DHCP discovery from 0.0.0.0 and IPv6 DAD on ingress; broadcast
 *     and multicast destinations on egress).
 *   - Priority 80: drop all remaining IP traffic for this MAC. */
build_port_security_ip(enum ovn_pipeline pipeline, struct ovn_port *op,
                       struct hmap *lflows)
    char *port_direction;
    enum ovn_stage stage;
    if (pipeline == P_IN) {
        port_direction = "inport";
        stage = S_SWITCH_IN_PORT_SEC_IP;
        port_direction = "outport";
        stage = S_SWITCH_OUT_PORT_SEC_IP;
    for (size_t i = 0; i < op->n_ps_addrs; i++) {
        struct lport_addresses *ps = &op->ps_addrs[i];
        /* Entries with no L3 addresses impose no IP constraints here. */
        if (!(ps->n_ipv4_addrs || ps->n_ipv6_addrs)) {
        if (ps->n_ipv4_addrs) {
            struct ds match = DS_EMPTY_INITIALIZER;
            if (pipeline == P_IN) {
                /* Permit use of the unspecified address for DHCP discovery */
                struct ds dhcp_match = DS_EMPTY_INITIALIZER;
                ds_put_format(&dhcp_match, "inport == %s"
                              " && ip4.src == 0.0.0.0"
                              " && ip4.dst == 255.255.255.255"
                              " && udp.src == 68 && udp.dst == 67",
                              op->json_key, ps->ea_s);
                ovn_lflow_add(lflows, op->od, stage, 90,
                              ds_cstr(&dhcp_match), "next;");
                ds_destroy(&dhcp_match);
                ds_put_format(&match, "inport == %s && eth.src == %s"
                              " && ip4.src == {", op->json_key,
                /* Egress: also admit limited broadcast and multicast. */
                ds_put_format(&match, "outport == %s && eth.dst == %s"
                              " && ip4.dst == {255.255.255.255, 224.0.0.0/4, ",
                              op->json_key, ps->ea_s);
            for (int j = 0; j < ps->n_ipv4_addrs; j++) {
                ovs_be32 mask = ps->ipv4_addrs[j].mask;
                /* When the netmask is applied, if the host portion is
                 * non-zero, the host can only use the specified
                 * address. If zero, the host is allowed to use any
                 * address in the subnet.
                if (ps->ipv4_addrs[j].plen == 32
                    || ps->ipv4_addrs[j].addr & ~mask) {
                    ds_put_format(&match, "%s", ps->ipv4_addrs[j].addr_s);
                    if (pipeline == P_OUT && ps->ipv4_addrs[j].plen != 32) {
                        /* Host is also allowed to receive packets to the
                         * broadcast address in the specified subnet. */
                        ds_put_format(&match, ", %s",
                                      ps->ipv4_addrs[j].bcast_s);
                    /* host portion is zero */
                    ds_put_format(&match, "%s/%d", ps->ipv4_addrs[j].network_s,
                                  ps->ipv4_addrs[j].plen);
                ds_put_cstr(&match, ", ");
            /* Replace ", " by "}". */
            ds_chomp(&match, ' ');
            ds_chomp(&match, ',');
            ds_put_cstr(&match, "}");
            ovn_lflow_add(lflows, op->od, stage, 90, ds_cstr(&match), "next;");
        if (ps->n_ipv6_addrs) {
            struct ds match = DS_EMPTY_INITIALIZER;
            if (pipeline == P_IN) {
                /* Permit use of unspecified address for duplicate address
                struct ds dad_match = DS_EMPTY_INITIALIZER;
                ds_put_format(&dad_match, "inport == %s"
                              " && ip6.dst == ff02::/16"
                              " && icmp6.type == {131, 135, 143}", op->json_key,
                ovn_lflow_add(lflows, op->od, stage, 90,
                              ds_cstr(&dad_match), "next;");
                ds_destroy(&dad_match);
            ds_put_format(&match, "%s == %s && %s == %s",
                          port_direction, op->json_key,
                          pipeline == P_IN ? "eth.src" : "eth.dst", ps->ea_s);
            build_port_security_ipv6_flow(pipeline, &match, ps->ea,
                                          ps->ipv6_addrs, ps->n_ipv6_addrs);
            ovn_lflow_add(lflows, op->od, stage, 90,
                          ds_cstr(&match), "next;");
        /* Priority-80 catch-all: drop IP traffic not matched above. */
        char *match = xasprintf("%s == %s && %s == %s && ip",
                                port_direction, op->json_key,
                                pipeline == P_IN ? "eth.src" : "eth.dst",
        ovn_lflow_add(lflows, op->od, stage, 80, match, "drop;");
1339 lsp_is_enabled(const struct nbrec_logical_switch_port *lsp)
1341 return !lsp->enabled || *lsp->enabled;
1345 lsp_is_up(const struct nbrec_logical_switch_port *lsp)
1347 return !lsp->up || *lsp->up;
1351 has_stateful_acl(struct ovn_datapath *od)
1353 for (size_t i = 0; i < od->nbs->n_acls; i++) {
1354 struct nbrec_acl *acl = od->nbs->acls[i];
1355 if (!strcmp(acl->action, "allow-related")) {
/* Populates the ingress and egress pre-ACL stages for datapath 'od':
 * default-allow at priority 0; when stateful ACLs exist, priority-110
 * exemptions for router ports and ND, and a priority-100 flow that sets
 * REGBIT_CONNTRACK_DEFRAG on all IP traffic so the pre-stateful stage
 * sends it through conntrack for defragmentation. */
build_pre_acls(struct ovn_datapath *od, struct hmap *lflows,
    bool has_stateful = has_stateful_acl(od);
    struct ovn_port *op;
    /* Ingress and Egress Pre-ACL Table (Priority 0): Packets are
     * allowed by default. */
    ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 0, "1", "next;");
    ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 0, "1", "next;");
    /* If there are any stateful ACL rules in this datapath, we must
     * send all IP packets through the conntrack action, which handles
     * defragmentation, in order to match L4 headers. */
        HMAP_FOR_EACH (op, key_node, ports) {
            if (op->od == od && !strcmp(op->nbs->type, "router")) {
                /* Can't use ct() for router ports. Consider the
                 * following configuration: lp1(10.0.0.2) on
                 * hostA--ls1--lr0--ls2--lp2(10.0.1.2) on hostB, For a
                 * ping from lp1 to lp2, First, the response will go
                 * through ct() with a zone for lp2 in the ls2 ingress
                 * pipeline on hostB. That ct zone knows about this
                 * connection. Next, it goes through ct() with the zone
                 * for the router port in the egress pipeline of ls2 on
                 * hostB. This zone does not know about the connection,
                 * as the icmp request went through the logical router
                 * on hostA, not hostB. This would only work with
                 * distributed conntrack state across all chassis. */
                struct ds match_in = DS_EMPTY_INITIALIZER;
                struct ds match_out = DS_EMPTY_INITIALIZER;
                ds_put_format(&match_in, "ip && inport == %s", op->json_key);
                ds_put_format(&match_out, "ip && outport == %s", op->json_key);
                ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 110,
                              ds_cstr(&match_in), "next;");
                ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 110,
                              ds_cstr(&match_out), "next;");
                ds_destroy(&match_in);
                ds_destroy(&match_out);
        /* Ingress and Egress Pre-ACL Table (Priority 110).
         *
         * Not to do conntrack on ND packets. */
        ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 110, "nd", "next;");
        ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 110, "nd", "next;");
        /* Ingress and Egress Pre-ACL Table (Priority 100).
         *
         * Regardless of whether the ACL is "from-lport" or "to-lport",
         * we need rules in both the ingress and egress table, because
         * the return traffic needs to be followed.
         *
         * 'REGBIT_CONNTRACK_DEFRAG' is set to let the pre-stateful table send
         * it to conntrack for tracking and defragmentation. */
        ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 100, "ip",
                      REGBIT_CONNTRACK_DEFRAG" = 1; next;");
        ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 100, "ip",
                      REGBIT_CONNTRACK_DEFRAG" = 1; next;");
/* For a 'key' of the form "IP:port" or just "IP", sets 'port' and
 * 'ip_address'. The caller must free() the memory allocated for
 * 'ip_address'.  On a malformed key, a rate-limited warning is logged. */
ip_address_and_port_from_lb_key(const char *key, char **ip_address,
    char *ip_str, *start, *next;
    /* Work on a private copy so strsep() can split it in place. */
    next = start = xstrdup(key);
    ip_str = strsep(&next, ":");
    if (!ip_str || !ip_str[0]) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
        VLOG_WARN_RL(&rl, "bad ip address for load balancer key %s", key);
    /* The IP part must parse as a plain host address (mask /32). */
    char *error = ip_parse_masked(ip_str, &ip, &mask);
    if (error || mask != OVS_BE32_MAX) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
        VLOG_WARN_RL(&rl, "bad ip address for load balancer key %s", key);
    /* Optional ":port" suffix must be a valid L4 port number. */
    if (next && next[0]) {
        if (!str_to_int(next, 0, &l4_port) || l4_port < 0 || l4_port > 65535) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
            VLOG_WARN_RL(&rl, "bad ip port for load balancer key %s", key);
    /* NOTE(review): plain strdup() may return NULL on allocation failure;
     * the surrounding code uses xstrdup() — confirm this is intentional. */
    *ip_address = strdup(ip_str);
/* Populates the ingress and egress pre-LB stages for datapath 'od':
 * default-allow at priority 0; when a load balancer is configured, flags
 * traffic destined to each unique VIP (and all egress IP traffic) with
 * REGBIT_CONNTRACK_DEFRAG so fragments are reassembled before the
 * stateful stage inspects L4 fields. */
build_pre_lb(struct ovn_datapath *od, struct hmap *lflows)
    /* Allow all packets to go to next tables by default. */
    ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 0, "1", "next;");
    ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 0, "1", "next;");
    struct sset all_ips = SSET_INITIALIZER(&all_ips);
    if (od->nbs->load_balancer) {
        struct nbrec_load_balancer *lb = od->nbs->load_balancer;
        struct smap *vips = &lb->vips;
        struct smap_node *node;
        bool vip_configured = false;
        SMAP_FOR_EACH (node, vips) {
            vip_configured = true;
            /* node->key contains IP:port or just IP. */
            char *ip_address = NULL;
            ip_address_and_port_from_lb_key(node->key, &ip_address, &port);
            /* Deduplicate: several VIP keys may share an IP (different
             * ports); one defrag flow per IP suffices. */
            if (!sset_contains(&all_ips, ip_address)) {
                sset_add(&all_ips, ip_address);
            /* Ignore L4 port information in the key because fragmented packets
             * may not have L4 information. The pre-stateful table will send
             * the packet through ct() action to de-fragment. In stateful
             * table, we will eventually look at L4 information. */
        /* 'REGBIT_CONNTRACK_DEFRAG' is set to let the pre-stateful table send
         * packet to conntrack for defragmentation. */
        const char *ip_address;
        SSET_FOR_EACH(ip_address, &all_ips) {
            char *match = xasprintf("ip && ip4.dst == %s", ip_address);
            ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
                          100, match, REGBIT_CONNTRACK_DEFRAG" = 1; next;");
        sset_destroy(&all_ips);
        if (vip_configured) {
            ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB,
                          100, "ip", REGBIT_CONNTRACK_DEFRAG" = 1; next;");
1530 build_pre_stateful(struct ovn_datapath *od, struct hmap *lflows)
1532 /* Ingress and Egress pre-stateful Table (Priority 0): Packets are
1533 * allowed by default. */
1534 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_STATEFUL, 0, "1", "next;");
1535 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_STATEFUL, 0, "1", "next;");
1537 /* If REGBIT_CONNTRACK_DEFRAG is set as 1, then the packets should be
1538 * sent to conntrack for tracking and defragmentation. */
1539 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_STATEFUL, 100,
1540 REGBIT_CONNTRACK_DEFRAG" == 1", "ct_next;");
1541 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_STATEFUL, 100,
1542 REGBIT_CONNTRACK_DEFRAG" == 1", "ct_next;");
/* Populates the ingress and egress ACL stages for datapath 'od':
 * baseline default-allow flows, the conntrack plumbing required when any
 * ACL is stateful (commit/allow/drop semantics keyed on ct state and
 * ct_label[0], the "marked for deletion" bit), and one or more logical
 * flows per user-configured ACL at acl->priority + OVN_ACL_PRI_OFFSET. */
build_acls(struct ovn_datapath *od, struct hmap *lflows)
    bool has_stateful = has_stateful_acl(od);
    /* Ingress and Egress ACL Table (Priority 0): Packets are allowed by
     * default. A related rule at priority 1 is added below if there
     * are any stateful ACLs in this datapath. */
    ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, 0, "1", "next;");
    ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, 0, "1", "next;");
        /* Ingress and Egress ACL Table (Priority 1).
         * By default, traffic is allowed. This is partially handled by
         * the Priority 0 ACL flows added earlier, but we also need to
         * commit IP flows. This is because, while the initiater's
         * direction may not have any stateful rules, the server's may
         * and then its return traffic would not have an associated
         * conntrack entry and would return "+invalid".
         * We use "ct_commit" for a connection that is not already known
         * by the connection tracker. Once a connection is committed,
         * subsequent packets will hit the flow at priority 0 that just
         * We also check for established connections that have ct_label[0]
         * set on them. That's a connection that was disallowed, but is
         * now allowed by policy again since it hit this default-allow flow.
         * We need to set ct_label[0]=0 to let the connection continue,
         * which will be done by ct_commit() in the "stateful" stage.
         * Subsequent packets will hit the flow at priority 0 that just
        ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, 1,
                      "ip && (!ct.est || (ct.est && ct_label[0] == 1))",
                       REGBIT_CONNTRACK_COMMIT" = 1; next;");
        ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, 1,
                      "ip && (!ct.est || (ct.est && ct_label[0] == 1))",
                       REGBIT_CONNTRACK_COMMIT" = 1; next;");
        /* Ingress and Egress ACL Table (Priority 65535).
         * Always drop traffic that's in an invalid state. Also drop
         * reply direction packets for connections that have been marked
         * for deletion (bit 0 of ct_label is set).
         * This is enforced at a higher priority than ACLs can be defined. */
        ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX,
                      "ct.inv || (ct.est && ct.rpl && ct_label[0] == 1)",
        ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX,
                      "ct.inv || (ct.est && ct.rpl && ct_label[0] == 1)",
        /* Ingress and Egress ACL Table (Priority 65535).
         * Allow reply traffic that is part of an established
         * conntrack entry that has not been marked for deletion
         * (bit 0 of ct_label). We only match traffic in the
         * reply direction because we want traffic in the request
         * direction to hit the currently defined policy from ACLs.
         * This is enforced at a higher priority than ACLs can be defined. */
        ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX,
                      "ct.est && !ct.rel && !ct.new && !ct.inv "
                      "&& ct.rpl && ct_label[0] == 0",
        ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX,
                      "ct.est && !ct.rel && !ct.new && !ct.inv "
                      "&& ct.rpl && ct_label[0] == 0",
        /* Ingress and Egress ACL Table (Priority 65535).
         * Allow traffic that is related to an existing conntrack entry that
         * has not been marked for deletion (bit 0 of ct_label).
         * This is enforced at a higher priority than ACLs can be defined.
         * NOTE: This does not support related data sessions (eg,
         * a dynamically negotiated FTP data channel), but will allow
         * related traffic such as an ICMP Port Unreachable through
         * that's generated from a non-listening UDP port. */
        ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX,
                      "!ct.est && ct.rel && !ct.new && !ct.inv "
                      "&& ct_label[0] == 0",
        ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX,
                      "!ct.est && ct.rel && !ct.new && !ct.inv "
                      "&& ct_label[0] == 0",
        /* Ingress and Egress ACL Table (Priority 65535).
         * Not to do conntrack on ND packets. */
        ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL, UINT16_MAX, "nd", "next;");
        ovn_lflow_add(lflows, od, S_SWITCH_OUT_ACL, UINT16_MAX, "nd", "next;");
    /* Ingress or Egress ACL Table (Various priorities). */
    for (size_t i = 0; i < od->nbs->n_acls; i++) {
        struct nbrec_acl *acl = od->nbs->acls[i];
        bool ingress = !strcmp(acl->direction, "from-lport") ? true :false;
        enum ovn_stage stage = ingress ? S_SWITCH_IN_ACL : S_SWITCH_OUT_ACL;
        if (!strcmp(acl->action, "allow")
            || !strcmp(acl->action, "allow-related")) {
            /* If there are any stateful flows, we must even commit "allow"
             * actions. This is because, while the initiater's
             * direction may not have any stateful rules, the server's
             * may and then its return traffic would not have an
             * associated conntrack entry and would return "+invalid". */
            if (!has_stateful) {
                ovn_lflow_add(lflows, od, stage,
                              acl->priority + OVN_ACL_PRI_OFFSET,
                              acl->match, "next;");
                struct ds match = DS_EMPTY_INITIALIZER;
                /* Commit the connection tracking entry if it's a new
                 * connection that matches this ACL. After this commit,
                 * the reply traffic is allowed by a flow we create at
                 * priority 65535, defined earlier.
                 * It's also possible that a known connection was marked for
                 * deletion after a policy was deleted, but the policy was
                 * re-added while that connection is still known. We catch
                 * that case here and un-set ct_label[0] (which will be done
                 * by ct_commit in the "stateful" stage) to indicate that the
                 * connection should be allowed to resume.
                ds_put_format(&match, "((ct.new && !ct.est)"
                              " || (!ct.new && ct.est && !ct.rpl "
                              "&& ct_label[0] == 1)) "
                              "&& (%s)", acl->match);
                ovn_lflow_add(lflows, od, stage,
                              acl->priority + OVN_ACL_PRI_OFFSET,
                               REGBIT_CONNTRACK_COMMIT" = 1; next;");
                /* Match on traffic in the request direction for an established
                 * connection tracking entry that has not been marked for
                 * deletion. There is no need to commit here, so we can just
                 * proceed to the next table. We use this to ensure that this
                 * connection is still allowed by the currently defined
                ds_put_format(&match,
                              "!ct.new && ct.est && !ct.rpl"
                              " && ct_label[0] == 0 && (%s)",
                ovn_lflow_add(lflows, od, stage,
                              acl->priority + OVN_ACL_PRI_OFFSET,
                              ds_cstr(&match), "next;");
        } else if (!strcmp(acl->action, "drop")
                   || !strcmp(acl->action, "reject")) {
            struct ds match = DS_EMPTY_INITIALIZER;
            /* XXX Need to support "reject", treat it as "drop;" for now. */
            if (!strcmp(acl->action, "reject")) {
                VLOG_INFO("reject is not a supported action");
            /* The implementation of "drop" differs if stateful ACLs are in
             * use for this datapath. In that case, the actions differ
             * depending on whether the connection was previously committed
             * to the connection tracker with ct_commit. */
                /* If the packet is not part of an established connection, then
                 * we can simply drop it. */
                ds_put_format(&match,
                              "(!ct.est || (ct.est && ct_label[0] == 1)) "
                ovn_lflow_add(lflows, od, stage, acl->priority +
                              OVN_ACL_PRI_OFFSET, ds_cstr(&match), "drop;");
                /* For an existing connection without ct_label set, we've
                 * encountered a policy change. ACLs previously allowed
                 * this connection and we committed the connection tracking
                 * entry. Current policy says that we should drop this
                 * connection. First, we set bit 0 of ct_label to indicate
                 * that this connection is set for deletion. By not
                 * specifying "next;", we implicitly drop the packet after
                 * updating conntrack state. We would normally defer
                 * ct_commit() to the "stateful" stage, but since we're
                 * dropping the packet, we go ahead and do it here. */
                ds_put_format(&match,
                              "ct.est && ct_label[0] == 0 && (%s)",
                ovn_lflow_add(lflows, od, stage,
                              acl->priority + OVN_ACL_PRI_OFFSET,
                              ds_cstr(&match), "ct_commit(ct_label=1/1);");
                /* There are no stateful ACLs in use on this datapath,
                 * so a "drop" ACL is simply the "drop" logical flow action
                ovn_lflow_add(lflows, od, stage,
                              acl->priority + OVN_ACL_PRI_OFFSET,
                              acl->match, "drop;");
1757 build_lb(struct ovn_datapath *od, struct hmap *lflows)
1759 /* Ingress and Egress LB Table (Priority 0): Packets are allowed by
1761 ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, 0, "1", "next;");
1762 ovn_lflow_add(lflows, od, S_SWITCH_OUT_LB, 0, "1", "next;");
1764 if (od->nbs->load_balancer) {
1765 /* Ingress and Egress LB Table (Priority 65535).
1767 * Send established traffic through conntrack for just NAT. */
1768 ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, UINT16_MAX,
1769 "ct.est && !ct.rel && !ct.new && !ct.inv",
1770 REGBIT_CONNTRACK_NAT" = 1; next;");
1771 ovn_lflow_add(lflows, od, S_SWITCH_OUT_LB, UINT16_MAX,
1772 "ct.est && !ct.rel && !ct.new && !ct.inv",
1773 REGBIT_CONNTRACK_NAT" = 1; next;");
/* Populates the ingress and egress stateful stages for datapath 'od':
 * default-allow at priority 0; commit flows (clearing the ct_label[0]
 * "deleted" bit) when REGBIT_CONNTRACK_COMMIT is set; "ct_lb;" NAT-only
 * flows when REGBIT_CONNTRACK_NAT is set; and, per load-balancer VIP,
 * priority 120/110 flows that distribute new connections via ct_lb(). */
build_stateful(struct ovn_datapath *od, struct hmap *lflows)
    /* Ingress and Egress stateful Table (Priority 0): Packets are
     * allowed by default. */
    ovn_lflow_add(lflows, od, S_SWITCH_IN_STATEFUL, 0, "1", "next;");
    ovn_lflow_add(lflows, od, S_SWITCH_OUT_STATEFUL, 0, "1", "next;");
    /* If REGBIT_CONNTRACK_COMMIT is set as 1, then the packets should be
     * committed to conntrack. We always set ct_label[0] to 0 here as
     * any packet that makes it this far is part of a connection we
     * want to allow to continue. */
    ovn_lflow_add(lflows, od, S_SWITCH_IN_STATEFUL, 100,
                  REGBIT_CONNTRACK_COMMIT" == 1", "ct_commit(ct_label=0/1); next;");
    ovn_lflow_add(lflows, od, S_SWITCH_OUT_STATEFUL, 100,
                  REGBIT_CONNTRACK_COMMIT" == 1", "ct_commit(ct_label=0/1); next;");
    /* If REGBIT_CONNTRACK_NAT is set as 1, then packets should just be sent
     * through nat (without committing).
     * REGBIT_CONNTRACK_COMMIT is set for new connections and
     * REGBIT_CONNTRACK_NAT is set for established connections. So they
    ovn_lflow_add(lflows, od, S_SWITCH_IN_STATEFUL, 100,
                  REGBIT_CONNTRACK_NAT" == 1", "ct_lb;");
    ovn_lflow_add(lflows, od, S_SWITCH_OUT_STATEFUL, 100,
                  REGBIT_CONNTRACK_NAT" == 1", "ct_lb;");
    /* Load balancing rules for new connections get committed to conntrack
     * table. So even if REGBIT_CONNTRACK_COMMIT is set in a previous table
     * a higher priority rule for load balancing below also commits the
     * connection, so it is okay if we do not hit the above match on
     * REGBIT_CONNTRACK_COMMIT. */
    if (od->nbs->load_balancer) {
        struct nbrec_load_balancer *lb = od->nbs->load_balancer;
        struct smap *vips = &lb->vips;
        struct smap_node *node;
        SMAP_FOR_EACH (node, vips) {
            /* node->key contains IP:port or just IP. */
            char *ip_address = NULL;
            ip_address_and_port_from_lb_key(node->key, &ip_address, &port);
            /* New connections in Ingress table. */
            char *action = xasprintf("ct_lb(%s);", node->value);
            struct ds match = DS_EMPTY_INITIALIZER;
            ds_put_format(&match, "ct.new && ip && ip4.dst == %s", ip_address);
            /* NOTE(review): the appended L4 clause below starts with "&&"
             * and no leading space, which concatenates directly onto the
             * VIP address (e.g. "...ip4.dst == 10.0.0.4&& udp") — confirm
             * against the OVN match lexer that this still tokenizes, or
             * that a missing line adds the separator. */
            if (lb->protocol && !strcmp(lb->protocol, "udp")) {
                ds_put_format(&match, "&& udp && udp.dst == %d", port);
                ds_put_format(&match, "&& tcp && tcp.dst == %d", port);
            ovn_lflow_add(lflows, od, S_SWITCH_IN_STATEFUL,
                          120, ds_cstr(&match), action);
            ovn_lflow_add(lflows, od, S_SWITCH_IN_STATEFUL,
                          110, ds_cstr(&match), action);
/* Builds all logical-switch flows: per-datapath pre-ACL/LB/stateful and
 * ACL stages, port-security (L2/IP/ND) in both directions, the ARP/ND
 * responder, and the L2 destination lookup (known MACs, multicast flood,
 * and the "unknown" fallback group).  Results go into 'lflows' and
 * 'mcgroups'. */
build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
                    struct hmap *lflows, struct hmap *mcgroups)
    /* This flow table structure is documented in ovn-northd(8), so please
     * update ovn-northd.8.xml if you change anything. */
    struct ds match = DS_EMPTY_INITIALIZER;
    struct ds actions = DS_EMPTY_INITIALIZER;
    /* Build pre-ACL and ACL tables for both ingress and egress.
     * Ingress tables 3 and 4. Egress tables 0 and 1. */
    struct ovn_datapath *od;
    HMAP_FOR_EACH (od, key_node, datapaths) {
        build_pre_acls(od, lflows, ports);
        build_pre_lb(od, lflows);
        build_pre_stateful(od, lflows);
        build_acls(od, lflows);
        build_lb(od, lflows);
        build_stateful(od, lflows);
    /* Logical switch ingress table 0: Admission control framework (priority
    HMAP_FOR_EACH (od, key_node, datapaths) {
        /* Logical VLANs not supported. */
        ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "vlan.present",
        /* Broadcast/multicast source address is invalid. */
        ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_L2, 100, "eth.src[40]",
        /* Port security flows have priority 50 (see below) and will continue
         * to the next table if packet source is acceptable. */
    /* Logical switch ingress table 0: Ingress port security - L2
     * Ingress table 1: Ingress port security - IP (priority 90 and 80)
     * Ingress table 2: Ingress port security - ND (priority 90 and 80)
    struct ovn_port *op;
    HMAP_FOR_EACH (op, key_node, ports) {
        if (!lsp_is_enabled(op->nbs)) {
            /* Drop packets from disabled logical ports (since logical flow
             * tables are default-drop). */
        ds_put_format(&match, "inport == %s", op->json_key);
        build_port_security_l2("eth.src", op->ps_addrs, op->n_ps_addrs,
        ovn_lflow_add(lflows, op->od, S_SWITCH_IN_PORT_SEC_L2, 50,
                      ds_cstr(&match), "next;");
        /* IP/ND port security only applies when port_security is set. */
        if (op->nbs->n_port_security) {
            build_port_security_ip(P_IN, op, lflows);
            build_port_security_nd(op, lflows);
    /* Ingress table 1 and 2: Port security - IP and ND, by default goto next.
    HMAP_FOR_EACH (od, key_node, datapaths) {
        ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_ND, 0, "1", "next;");
        ovn_lflow_add(lflows, od, S_SWITCH_IN_PORT_SEC_IP, 0, "1", "next;");
    /* Ingress table 9: ARP responder, skip requests coming from localnet ports.
     * (priority 100). */
    HMAP_FOR_EACH (op, key_node, ports) {
        if (!strcmp(op->nbs->type, "localnet")) {
            ds_put_format(&match, "inport == %s", op->json_key);
            ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 100,
                          ds_cstr(&match), "next;");
    /* Ingress table 9: ARP/ND responder, reply for known IPs.
    HMAP_FOR_EACH (op, key_node, ports) {
         * Add ARP/ND reply flows if either the
         *  - port type is router
        if (!lsp_is_up(op->nbs) && strcmp(op->nbs->type, "router")) {
        for (size_t i = 0; i < op->n_lsp_addrs; i++) {
            for (size_t j = 0; j < op->lsp_addrs[i].n_ipv4_addrs; j++) {
                ds_put_format(&match, "arp.tpa == %s && arp.op == 1",
                              op->lsp_addrs[i].ipv4_addrs[j].addr_s);
                ds_put_format(&actions,
                    "eth.dst = eth.src; "
                    "arp.op = 2; /* ARP reply */ "
                    "arp.tha = arp.sha; "
                    "arp.tpa = arp.spa; "
                    "outport = inport; "
                    "inport = \"\"; /* Allow sending out inport. */ "
                    op->lsp_addrs[i].ea_s, op->lsp_addrs[i].ea_s,
                    op->lsp_addrs[i].ipv4_addrs[j].addr_s);
                ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 50,
                              ds_cstr(&match), ds_cstr(&actions));
            /* ND responder: reply to neighbor solicitations (type 135)
             * targeting any of this port's IPv6 addresses. */
            if (op->lsp_addrs[i].n_ipv6_addrs > 0) {
                ds_put_cstr(&match, "icmp6 && icmp6.type == 135 && ");
                if (op->lsp_addrs[i].n_ipv6_addrs == 1) {
                    ds_put_format(&match, "nd.target == %s",
                                  op->lsp_addrs[i].ipv6_addrs[0].addr_s);
                    ds_put_format(&match, "nd.target == {");
                    for (size_t j = 0; j < op->lsp_addrs[i].n_ipv6_addrs; j++) {
                                      op->lsp_addrs[i].ipv6_addrs[j].addr_s);
                    ds_chomp(&match, ' ');
                    ds_chomp(&match, ',');
                    ds_put_cstr(&match, "}");
                ds_put_format(&actions,
                    "na { eth.src = %s; "
                    "outport = inport; "
                    "inport = \"\"; /* Allow sending out inport. */ "
                    op->lsp_addrs[i].ea_s,
                    op->lsp_addrs[i].ea_s);
                ovn_lflow_add(lflows, op->od, S_SWITCH_IN_ARP_ND_RSP, 50,
                              ds_cstr(&match), ds_cstr(&actions));
    /* Ingress table 9: ARP/ND responder, by default goto next.
    HMAP_FOR_EACH (od, key_node, datapaths) {
        ovn_lflow_add(lflows, od, S_SWITCH_IN_ARP_ND_RSP, 0, "1", "next;");
    /* Ingress table 10: Destination lookup, broadcast and multicast handling
     * (priority 100). */
    HMAP_FOR_EACH (op, key_node, ports) {
        if (lsp_is_enabled(op->nbs)) {
            ovn_multicast_add(mcgroups, &mc_flood, op);
    HMAP_FOR_EACH (od, key_node, datapaths) {
        ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 100, "eth.mcast",
                      "outport = \""MC_FLOOD"\"; output;");
    /* Ingress table 10: Destination lookup, unicast handling (priority 50), */
    HMAP_FOR_EACH (op, key_node, ports) {
        for (size_t i = 0; i < op->nbs->n_addresses; i++) {
            struct eth_addr mac;
            if (eth_addr_from_string(op->nbs->addresses[i], &mac)) {
                ds_put_format(&match, "eth.dst == "ETH_ADDR_FMT,
                              ETH_ADDR_ARGS(mac));
                ds_put_format(&actions, "outport = %s; output;", op->json_key);
                ovn_lflow_add(lflows, op->od, S_SWITCH_IN_L2_LKUP, 50,
                              ds_cstr(&match), ds_cstr(&actions));
            } else if (!strcmp(op->nbs->addresses[i], "unknown")) {
                if (lsp_is_enabled(op->nbs)) {
                    ovn_multicast_add(mcgroups, &mc_unknown, op);
                    op->od->has_unknown = true;
                static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
                             "%s: invalid syntax '%s' in addresses column",
                             op->nbs->name, op->nbs->addresses[i]);
    /* Ingress table 10: Destination lookup for unknown MACs (priority 0). */
    HMAP_FOR_EACH (od, key_node, datapaths) {
        if (od->has_unknown) {
            ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 0, "1",
                          "outport = \""MC_UNKNOWN"\"; output;");
    /* Egress tables 6: Egress port security - IP (priority 0)
     * Egress table 7: Egress port security L2 - multicast/broadcast
     * (priority 100). */
    HMAP_FOR_EACH (od, key_node, datapaths) {
        ovn_lflow_add(lflows, od, S_SWITCH_OUT_PORT_SEC_IP, 0, "1", "next;");
        ovn_lflow_add(lflows, od, S_SWITCH_OUT_PORT_SEC_L2, 100, "eth.mcast",
    /* Egress table 6: Egress port security - IP (priorities 90 and 80)
     * if port security enabled.
     * Egress table 7: Egress port security - L2 (priorities 50 and 150).
     * Priority 50 rules implement port security for enabled logical port.
     * Priority 150 rules drop packets to disabled logical ports, so that they
     * don't even receive multicast or broadcast packets. */
    HMAP_FOR_EACH (op, key_node, ports) {
        ds_put_format(&match, "outport == %s", op->json_key);
        if (lsp_is_enabled(op->nbs)) {
            build_port_security_l2("eth.dst", op->ps_addrs, op->n_ps_addrs,
            ovn_lflow_add(lflows, op->od, S_SWITCH_OUT_PORT_SEC_L2, 50,
                          ds_cstr(&match), "output;");
            ovn_lflow_add(lflows, op->od, S_SWITCH_OUT_PORT_SEC_L2, 150,
                          ds_cstr(&match), "drop;");
        if (op->nbs->n_port_security) {
            build_port_security_ip(P_OUT, op, lflows);
    ds_destroy(&actions);
2146 lrport_is_enabled(const struct nbrec_logical_router_port *lrport)
2148 return !lrport->enabled || *lrport->enabled;
/* Returns a string of the IP address of the router port 'op' that
 * overlaps with 'ip_s'. If one is not found, returns NULL.
 *
 * The caller must not free the returned string. */
find_lrp_member_ip(const struct ovn_port *op, const char *ip_s)
    if (!ip_parse(ip_s, &ip)) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
        VLOG_WARN_RL(&rl, "bad ip address %s", ip_s);
    /* Scan the port's IPv4 networks for one whose subnet contains 'ip'. */
    for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
        const struct ipv4_netaddr *na = &op->lrp_networks.ipv4_addrs[i];
        /* XOR then mask: zero iff 'ip' and the network agree on every
         * bit covered by the netmask, i.e. 'ip' is inside the subnet. */
        if (!((na->network ^ ip) & na->mask)) {
            /* There should be only 1 interface that matches the
             * next hop. Otherwise, it's a configuration error,
             * because subnets of router's interfaces should NOT
/* Adds a route flow to stage S_ROUTER_IN_IP_ROUTING of 'op''s datapath:
 * match "ip4.dst == network_s/plen", decrement TTL, load the next hop
 * ('gateway' if given, else the packet's own ip4.dst) into reg0, rewrite
 * eth.src and outport for 'op', and clear inport so the packet may be
 * sent back out the port it arrived on.  Priority is 'plen' so that more
 * specific prefixes win (longest-prefix match). */
add_route(struct hmap *lflows, const struct ovn_port *op,
          const char *lrp_addr_s, const char *network_s, int plen,
          const char *gateway)
    char *match = xasprintf("ip4.dst == %s/%d", network_s, plen);
    struct ds actions = DS_EMPTY_INITIALIZER;
    ds_put_cstr(&actions, "ip.ttl--; reg0 = ");
        ds_put_cstr(&actions, gateway);
        /* Directly attached network: next hop is the destination itself. */
        ds_put_cstr(&actions, "ip4.dst");
    ds_put_format(&actions, "; "
                  "inport = \"\"; /* Allow sending out inport. */ "
                  op->lrp_networks.ea_s,
    /* The priority here is calculated to implement longest-prefix-match
    ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_ROUTING, plen, match,
    ds_destroy(&actions);
/* Translates one northbound static 'route' on logical router 'od' into a
 * routing flow via add_route().  Validates the next hop (must be a /32
 * host address) and the ip_prefix (must be valid CIDR), then resolves the
 * output port: either the explicitly configured 'output_port' or the
 * first router port whose subnet contains the next hop.  Logs a
 * rate-limited warning and emits nothing on any validation failure. */
build_static_route_flow(struct hmap *lflows, struct ovn_datapath *od,
                        const struct nbrec_logical_router_static_route *route)
    ovs_be32 prefix, nexthop, mask;
    const char *lrp_addr_s;
    /* Verify that next hop is an IP address with 32 bits mask. */
    char *error = ip_parse_masked(route->nexthop, &nexthop, &mask);
    if (error || mask != OVS_BE32_MAX) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
        VLOG_WARN_RL(&rl, "bad next hop ip address %s", route->nexthop);
    /* Verify that ip prefix is a valid CIDR address. */
    error = ip_parse_masked(route->ip_prefix, &prefix, &mask);
    if (error || !ip_is_cidr(mask)) {
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
        VLOG_WARN_RL(&rl, "bad 'ip_prefix' in static routes %s",
    /* Find the outgoing port. */
    struct ovn_port *out_port = NULL;
    if (route->output_port) {
        out_port = ovn_port_find(ports, route->output_port);
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
            VLOG_WARN_RL(&rl, "Bad out port %s for static route %s",
                         route->output_port, route->ip_prefix);
        lrp_addr_s = find_lrp_member_ip(out_port, route->nexthop);
        /* output_port is not specified, find the
         * router port matching the next hop. */
        for (i = 0; i < od->nbr->n_ports; i++) {
            struct nbrec_logical_router_port *lrp = od->nbr->ports[i];
            out_port = ovn_port_find(ports, lrp->name);
                /* This should not happen. */
            lrp_addr_s = find_lrp_member_ip(out_port, route->nexthop);
        /* There is no matched out port. */
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
        VLOG_WARN_RL(&rl, "No path for static route %s; next hop %s",
                     route->ip_prefix, route->nexthop);
    /* Canonicalize the prefix (mask off host bits) before emitting. */
    char *prefix_s = xasprintf(IP_FMT, IP_ARGS(prefix & mask));
    add_route(lflows, out_port, lrp_addr_s, prefix_s,
              ip_count_cidr_bits(mask), route->nexthop);
/* Appends the IPv4 addresses of router port 'op' to dynamic string 'ds' as
 * an OVN match-expression operand: a single bare address when the port has
 * exactly one address and 'add_bcast' is false, otherwise a brace-enclosed
 * set ("{a, b, ...}").  When 'add_bcast' is true, each network's broadcast
 * address is included alongside its unicast address.
 * NOTE(review): the early 'return', the if (add_bcast) guard around the
 * bcast_s line, and the trailing ds_chomp() calls are in elided lines. */
2285 op_put_networks(struct ds *ds, const struct ovn_port *op, bool add_bcast)
2287 if (!add_bcast && op->lrp_networks.n_ipv4_addrs == 1) {
2288 ds_put_format(ds, "%s", op->lrp_networks.ipv4_addrs[0].addr_s);
2292 ds_put_cstr(ds, "{");
2293 for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
2294 ds_put_format(ds, "%s, ", op->lrp_networks.ipv4_addrs[i].addr_s);
2296 ds_put_format(ds, "%s, ", op->lrp_networks.ipv4_addrs[i].bcast_s);
2301 ds_put_cstr(ds, "}");
/* Populates 'lflows' with the logical flows for every logical router in
 * 'datapaths' and every router port in 'ports'.  The stages built here, in
 * pipeline order: admission control, IP input (ICMP echo, ARP reply/learn,
 * martian drops), UNSNAT/DNAT/SNAT for gateway routers, IP routing, ARP
 * resolution, ARP request, and egress delivery.  The per-stage structure is
 * documented in ovn-northd(8); keep ovn-northd.8.xml in sync.
 * NOTE(review): many lines are elided in this listing (ds_clear() calls
 * between flows, loop-body continue/skip statements, closing braces); the
 * comments below describe only what the visible lines establish. */
2305 build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
2306 struct hmap *lflows)
2308 /* This flow table structure is documented in ovn-northd(8), so please
2309 * update ovn-northd.8.xml if you change anything. */
2311 struct ds match = DS_EMPTY_INITIALIZER;
2312 struct ds actions = DS_EMPTY_INITIALIZER;
2314 /* Logical router ingress table 0: Admission control framework. */
2315 struct ovn_datapath *od;
2316 HMAP_FOR_EACH (od, key_node, datapaths) {
2321 /* Logical VLANs not supported.
2322 * Broadcast/multicast source address is invalid. */
2323 ovn_lflow_add(lflows, od, S_ROUTER_IN_ADMISSION, 100,
2324 "vlan.present || eth.src[40]", "drop;");
2327 /* Logical router ingress table 0: match (priority 50). */
2328 struct ovn_port *op;
2329 HMAP_FOR_EACH (op, key_node, ports) {
2334 if (!lrport_is_enabled(op->nbr)) {
2335 /* Drop packets from disabled logical ports (since logical flow
2336 * tables are default-drop). */
/* Accept unicast to the port's own MAC, plus Ethernet multicast, but only
 * on the port itself. */
2341 ds_put_format(&match, "(eth.mcast || eth.dst == %s) && inport == %s",
2342 op->lrp_networks.ea_s, op->json_key);
2343 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_ADMISSION, 50,
2344 ds_cstr(&match), "next;");
2347 /* Logical router ingress table 1: IP Input. */
2348 HMAP_FOR_EACH (od, key_node, datapaths) {
2353 /* L3 admission control: drop multicast and broadcast source, localhost
2354 * source or destination, and zero network source or destination
2355 * (priority 100). */
2356 ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 100,
2358 "ip4.src == 255.255.255.255 || "
2359 "ip4.src == 127.0.0.0/8 || "
2360 "ip4.dst == 127.0.0.0/8 || "
2361 "ip4.src == 0.0.0.0/8 || "
2362 "ip4.dst == 0.0.0.0/8",
2365 /* ARP reply handling. Use ARP replies to populate the logical
2366 * router's ARP table. */
2367 ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 90, "arp.op == 2",
2368 "put_arp(inport, arp.spa, arp.sha);");
2370 /* Drop Ethernet local broadcast. By definition this traffic should
2371 * not be forwarded.*/
2372 ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 50,
2373 "eth.bcast", "drop;");
/* TTL discard: drop IPv4 packets whose TTL is 0 or 1. */
2377 * XXX Need to send ICMP time exceeded if !ip.later_frag. */
2379 ds_put_cstr(&match, "ip4 && ip.ttl == {0, 1}");
2380 ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 30,
2381 ds_cstr(&match), "drop;");
2383 /* Pass other traffic not already handled to the next table for
2385 ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 0, "1", "next;");
/* Per-router-port IP input flows. */
2388 HMAP_FOR_EACH (op, key_node, ports) {
2393 /* L3 admission control: drop packets that originate from an IP address
2394 * owned by the router or a broadcast address known to the router
2395 * (priority 100). */
2397 ds_put_cstr(&match, "ip4.src == ");
2398 op_put_networks(&match, op, true);
2399 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 100,
2400 ds_cstr(&match), "drop;");
2402 /* ICMP echo reply. These flows reply to ICMP echo requests
2403 * received for the router's IP address. Since packets only
2404 * get here as part of the logical router datapath, the inport
2405 * (i.e. the incoming locally attached net) does not matter.
2406 * The ip.ttl also does not matter (RFC1812 section 4.2.2.9) */
2408 ds_put_cstr(&match, "ip4.dst == ");
2409 op_put_networks(&match, op, false);
2410 ds_put_cstr(&match, " && icmp4.type == 8 && icmp4.code == 0");
2413 ds_put_format(&actions,
2414 "ip4.dst <-> ip4.src; "
2417 "inport = \"\"; /* Allow sending out inport. */ "
2419 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
2420 ds_cstr(&match), ds_cstr(&actions));
2422 /* ARP reply. These flows reply to ARP requests for the router's own
/* One ARP-responder flow per configured IPv4 address of this port. */
2424 for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
2426 ds_put_format(&match,
2427 "inport == %s && arp.tpa == %s && arp.op == 1",
2428 op->json_key, op->lrp_networks.ipv4_addrs[i].addr_s);
2431 ds_put_format(&actions,
2432 "eth.dst = eth.src; "
2434 "arp.op = 2; /* ARP reply */ "
2435 "arp.tha = arp.sha; "
2437 "arp.tpa = arp.spa; "
2440 "inport = \"\"; /* Allow sending out inport. */ "
2442 op->lrp_networks.ea_s,
2443 op->lrp_networks.ea_s,
2444 op->lrp_networks.ipv4_addrs[i].addr_s,
2446 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
2447 ds_cstr(&match), ds_cstr(&actions));
2450 /* ARP handling for external IP addresses.
2452 * DNAT IP addresses are external IP addresses that need ARP
/* snat entries are skipped here: only dnat/dnat_and_snat external IPs need
 * an ARP responder. */
2454 for (int i = 0; i < op->od->nbr->n_nat; i++) {
2455 const struct nbrec_nat *nat;
2457 nat = op->od->nbr->nat[i];
2459 if(!strcmp(nat->type, "snat")) {
2464 if (!ip_parse(nat->external_ip, &ip) || !ip) {
2465 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
2466 VLOG_WARN_RL(&rl, "bad ip address %s in dnat configuration "
2467 "for router %s", nat->external_ip, op->key);
2472 ds_put_format(&match,
2473 "inport == %s && arp.tpa == "IP_FMT" && arp.op == 1",
2474 op->json_key, IP_ARGS(ip));
2477 ds_put_format(&actions,
2478 "eth.dst = eth.src; "
2480 "arp.op = 2; /* ARP reply */ "
2481 "arp.tha = arp.sha; "
2483 "arp.tpa = arp.spa; "
2484 "arp.spa = "IP_FMT"; "
2486 "inport = \"\"; /* Allow sending out inport. */ "
2488 op->lrp_networks.ea_s,
2489 op->lrp_networks.ea_s,
2492 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 90,
2493 ds_cstr(&match), ds_cstr(&actions));
2496 /* Drop IP traffic to this router, unless the router ip is used as
/* Collect the SNAT external IPs so router-owned addresses that double as
 * SNAT addresses are excluded from the drop flow below. */
2498 ovs_be32 *nat_ips = xmalloc(sizeof *nat_ips * op->od->nbr->n_nat);
2499 size_t n_nat_ips = 0;
2500 for (int i = 0; i < op->od->nbr->n_nat; i++) {
2501 const struct nbrec_nat *nat;
2504 nat = op->od->nbr->nat[i];
2505 if (strcmp(nat->type, "snat")) {
2509 if (!ip_parse(nat->external_ip, &ip) || !ip) {
2510 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
2511 VLOG_WARN_RL(&rl, "bad ip address %s in snat configuration "
2512 "for router %s", nat->external_ip, op->key);
2516 nat_ips[n_nat_ips++] = ip;
2520 ds_put_cstr(&match, "ip4.dst == {");
2521 bool has_drop_ips = false;
2522 for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
2523 for (int j = 0; j < n_nat_ips; j++) {
2524 if (op->lrp_networks.ipv4_addrs[i].addr == nat_ips[j]) {
2528 ds_put_format(&match, "%s, ",
2529 op->lrp_networks.ipv4_addrs[i].addr_s);
2530 has_drop_ips = true;
/* Trim the trailing ", " left by the loop above. */
2532 ds_chomp(&match, ' ');
2533 ds_chomp(&match, ',');
2534 ds_put_cstr(&match, "}");
2537 /* Drop IP traffic to this router. */
2538 ovn_lflow_add(lflows, op->od, S_ROUTER_IN_IP_INPUT, 60,
2539 ds_cstr(&match), "drop;");
2545 /* NAT in Gateway routers. */
2546 HMAP_FOR_EACH (od, key_node, datapaths) {
2551 /* Packets are allowed by default. */
2552 ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 0, "1", "next;");
2553 ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 0, "1", "next;");
2554 ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 0, "1", "next;");
2556 /* NAT rules are only valid on Gateway routers. */
2557 if (!smap_get(&od->nbr->options, "chassis")) {
2561 for (int i = 0; i < od->nbr->n_nat; i++) {
2562 const struct nbrec_nat *nat;
2564 nat = od->nbr->nat[i];
/* external_ip must always be a single host address (/32). */
2568 char *error = ip_parse_masked(nat->external_ip, &ip, &mask);
2569 if (error || mask != OVS_BE32_MAX) {
2570 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
2571 VLOG_WARN_RL(&rl, "bad external ip %s for nat",
2577 /* Check the validity of nat->logical_ip. 'logical_ip' can
2578 * be a subnet when the type is "snat". */
2579 error = ip_parse_masked(nat->logical_ip, &ip, &mask);
2580 if (!strcmp(nat->type, "snat")) {
2582 static struct vlog_rate_limit rl =
2583 VLOG_RATE_LIMIT_INIT(5, 1);
2584 VLOG_WARN_RL(&rl, "bad ip network or ip %s for snat "
2585 "in router "UUID_FMT"",
2586 nat->logical_ip, UUID_ARGS(&od->key));
/* For dnat/dnat_and_snat the logical_ip must be a single host (/32). */
2591 if (error || mask != OVS_BE32_MAX) {
2592 static struct vlog_rate_limit rl =
2593 VLOG_RATE_LIMIT_INIT(5, 1);
2594 VLOG_WARN_RL(&rl, "bad ip %s for dnat in router "
2595 ""UUID_FMT"", nat->logical_ip, UUID_ARGS(&od->key));
2601 /* Ingress UNSNAT table: It is for already established connections'
2602 * reverse traffic. i.e., SNAT has already been done in egress
2603 * pipeline and now the packet has entered the ingress pipeline as
2604 * part of a reply. We undo the SNAT here.
2606 * Undoing SNAT has to happen before DNAT processing. This is
2607 * because when the packet was DNATed in ingress pipeline, it did
2608 * not know about the possibility of eventual additional SNAT in
2609 * egress pipeline. */
2610 if (!strcmp(nat->type, "snat")
2611 || !strcmp(nat->type, "dnat_and_snat")) {
2613 ds_put_format(&match, "ip && ip4.dst == %s", nat->external_ip);
2614 ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 100,
2615 ds_cstr(&match), "ct_snat; next;");
2618 /* Ingress DNAT table: Packets enter the pipeline with destination
2619 * IP address that needs to be DNATted from a external IP address
2620 * to a logical IP address. */
2621 if (!strcmp(nat->type, "dnat")
2622 || !strcmp(nat->type, "dnat_and_snat")) {
2623 /* Packet when it goes from the initiator to destination.
2624 * We need to zero the inport because the router can
2625 * send the packet back through the same interface. */
2627 ds_put_format(&match, "ip && ip4.dst == %s", nat->external_ip);
2629 ds_put_format(&actions,"inport = \"\"; ct_dnat(%s);",
2631 ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 100,
2632 ds_cstr(&match), ds_cstr(&actions));
2635 /* Egress SNAT table: Packets enter the egress pipeline with
2636 * source ip address that needs to be SNATted to a external ip
2638 if (!strcmp(nat->type, "snat")
2639 || !strcmp(nat->type, "dnat_and_snat")) {
2641 ds_put_format(&match, "ip && ip4.src == %s", nat->logical_ip);
2643 ds_put_format(&actions, "ct_snat(%s);", nat->external_ip);
2645 /* The priority here is calculated such that the
2646 * nat->logical_ip with the longest mask gets a higher
2648 ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT,
2649 count_1bits(ntohl(mask)) + 1,
2650 ds_cstr(&match), ds_cstr(&actions));
2654 /* Re-circulate every packet through the DNAT zone.
2655 * This helps with two things.
2657 * 1. Any packet that needs to be unDNATed in the reverse
2658 * direction gets unDNATed. Ideally this could be done in
2659 * the egress pipeline. But since the gateway router
2660 * does not have any feature that depends on the source
2661 * ip address being external IP address for IP routing,
2662 * we can do it here, saving a future re-circulation.
2664 * 2. Any packet that was sent through SNAT zone in the
2665 * previous table automatically gets re-circulated to get
2666 * back the new destination IP address that is needed for
2667 * routing in the openflow pipeline. */
2668 ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50,
2669 "ip", "inport = \"\"; ct_dnat;");
2672 /* Logical router ingress table 4: IP Routing.
2674 * A packet that arrives at this table is an IP packet that should be
2675 * routed to the address in ip4.dst. This table sets outport to the correct
2676 * output port, eth.src to the output port's MAC address, and reg0 to the
2677 * next-hop IP address (leaving ip4.dst, the packet’s final destination,
2678 * unchanged), and advances to the next table for ARP resolution. */
2679 HMAP_FOR_EACH (op, key_node, ports) {
/* Connected routes: one per network directly attached to this port.
 * NULL gateway means "deliver directly" (next hop is ip4.dst itself). */
2684 for (int i = 0; i < op->lrp_networks.n_ipv4_addrs; i++) {
2685 add_route(lflows, op, op->lrp_networks.ipv4_addrs[i].addr_s,
2686 op->lrp_networks.ipv4_addrs[i].network_s,
2687 op->lrp_networks.ipv4_addrs[i].plen, NULL);
2691 HMAP_FOR_EACH (od, key_node, datapaths) {
2696 /* Convert the static routes to flows. */
2697 for (int i = 0; i < od->nbr->n_static_routes; i++) {
2698 const struct nbrec_logical_router_static_route *route;
2700 route = od->nbr->static_routes[i];
2701 build_static_route_flow(lflows, od, ports, route);
2704 /* XXX destination unreachable */
2706 /* Local router ingress table 5: ARP Resolution.
2708 * Any packet that reaches this table is an IP packet whose next-hop IP
2709 * address is in reg0. (ip4.dst is the final destination.) This table
2710 * resolves the IP address in reg0 into an output port in outport and an
2711 * Ethernet address in eth.dst. */
2712 HMAP_FOR_EACH (op, key_node, ports) {
2714 /* This is a logical router port. If next-hop IP address in 'reg0'
2715 * matches ip address of this router port, then the packet is
2716 * intended to eventually be sent to this logical port. Set the
2717 * destination mac address using this port's mac address.
2719 * The packet is still in peer's logical pipeline. So the match
2720 * should be on peer's outport. */
2721 if (op->nbr->peer) {
2722 struct ovn_port *peer = ovn_port_find(ports, op->nbr->peer);
2728 ds_put_format(&match, "outport == %s && reg0 == ",
2730 op_put_networks(&match, op, false);
2733 ds_put_format(&actions, "eth.dst = %s; next;",
2734 op->lrp_networks.ea_s);
2735 ovn_lflow_add(lflows, peer->od, S_ROUTER_IN_ARP_RESOLVE,
2736 100, ds_cstr(&match), ds_cstr(&actions));
2738 } else if (op->od->n_router_ports && strcmp(op->nbs->type, "router")) {
2739 /* This is a logical switch port that backs a VM or a container.
2740 * Extract its addresses. For each of the address, go through all
2741 * the router ports attached to the switch (to which this port
2742 * connects) and if the address in question is reachable from the
2743 * router port, add an ARP entry in that router's pipeline. */
2745 for (size_t i = 0; i < op->n_lsp_addrs; i++) {
2746 const char *ea_s = op->lsp_addrs[i].ea_s;
2747 for (size_t j = 0; j < op->lsp_addrs[i].n_ipv4_addrs; j++) {
2748 const char *ip_s = op->lsp_addrs[i].ipv4_addrs[j].addr_s;
2749 for (size_t k = 0; k < op->od->n_router_ports; k++) {
2750 /* Get the Logical_Router_Port that the
2751 * Logical_Switch_Port is connected to, as
2753 const char *peer_name = smap_get(
2754 &op->od->router_ports[k]->nbs->options,
2760 struct ovn_port *peer = ovn_port_find(ports, peer_name);
2761 if (!peer || !peer->nbr) {
/* Only add the ARP entry on routers that can actually reach this
 * address through one of their networks. */
2765 if (!find_lrp_member_ip(peer, ip_s)) {
2770 ds_put_format(&match, "outport == %s && reg0 == %s",
2771 peer->json_key, ip_s);
2774 ds_put_format(&actions, "eth.dst = %s; next;", ea_s);
2775 ovn_lflow_add(lflows, peer->od,
2776 S_ROUTER_IN_ARP_RESOLVE, 100,
2777 ds_cstr(&match), ds_cstr(&actions));
2781 } else if (!strcmp(op->nbs->type, "router")) {
2782 /* This is a logical switch port that connects to a router. */
2784 /* The peer of this switch port is the router port for which
2785 * we need to add logical flows such that it can resolve
2786 * ARP entries for all the other router ports connected to
2787 * the switch in question. */
2789 const char *peer_name = smap_get(&op->nbs->options,
2795 struct ovn_port *peer = ovn_port_find(ports, peer_name);
2796 if (!peer || !peer->nbr) {
2800 for (size_t i = 0; i < op->od->n_router_ports; i++) {
2801 const char *router_port_name = smap_get(
2802 &op->od->router_ports[i]->nbs->options,
2804 struct ovn_port *router_port = ovn_port_find(ports,
2806 if (!router_port || !router_port->nbr) {
2810 /* Skip the router port under consideration. */
2811 if (router_port == peer) {
2816 ds_put_format(&match, "outport == %s && reg0 == ",
2818 op_put_networks(&match, router_port, false);
2821 ds_put_format(&actions, "eth.dst = %s; next;",
2822 router_port->lrp_networks.ea_s);
2823 ovn_lflow_add(lflows, peer->od, S_ROUTER_IN_ARP_RESOLVE,
2824 100, ds_cstr(&match), ds_cstr(&actions));
/* Fallback: ask the ARP table maintained by put_arp() above. */
2829 HMAP_FOR_EACH (od, key_node, datapaths) {
2834 ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 0, "1",
2835 "get_arp(outport, reg0); next;");
2838 /* Local router ingress table 6: ARP request.
2840 * In the common case where the Ethernet destination has been resolved,
2841 * this table outputs the packet (priority 0). Otherwise, it composes
2842 * and sends an ARP request (priority 100). */
2843 HMAP_FOR_EACH (od, key_node, datapaths) {
2848 ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 100,
2849 "eth.dst == 00:00:00:00:00:00",
2851 "eth.dst = ff:ff:ff:ff:ff:ff; "
2853 "arp.op = 1; " /* ARP request */
2856 ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 0, "1", "output;");
2859 /* Logical router egress table 1: Delivery (priority 100).
2861 * Priority 100 rules deliver packets to enabled logical ports. */
2862 HMAP_FOR_EACH (op, key_node, ports) {
2867 if (!lrport_is_enabled(op->nbr)) {
2868 /* Drop packets to disabled logical ports (since logical flow
2869 * tables are default-drop). */
2874 ds_put_format(&match, "outport == %s", op->json_key);
2875 ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 100,
2876 ds_cstr(&match), "output;");
/* Release the scratch dynamic strings (match destroyed in elided line). */
2880 ds_destroy(&actions);
2883 /* Updates the Logical_Flow and Multicast_Group tables in the OVN_SB database,
2884 * constructing their contents based on the OVN_NB database. */
/* Strategy: build the desired flow/multicast sets in memory, then reconcile
 * against what is already in the southbound DB — matching rows are kept
 * (and removed from the in-memory set), stale rows are deleted, and whatever
 * remains in memory afterwards is inserted as new rows.  This minimizes
 * churn in the database. */
2886 build_lflows(struct northd_context *ctx, struct hmap *datapaths,
2889 struct hmap lflows = HMAP_INITIALIZER(&lflows);
2890 struct hmap mcgroups = HMAP_INITIALIZER(&mcgroups);
2892 build_lswitch_flows(datapaths, ports, &lflows, &mcgroups);
2893 build_lrouter_flows(datapaths, ports, &lflows);
2895 /* Push changes to the Logical_Flow table to database. */
2896 const struct sbrec_logical_flow *sbflow, *next_sbflow;
2897 SBREC_LOGICAL_FLOW_FOR_EACH_SAFE (sbflow, next_sbflow, ctx->ovnsb_idl) {
2898 struct ovn_datapath *od
2899 = ovn_datapath_from_sbrec(datapaths, sbflow->logical_datapath);
/* Flow references a datapath we no longer know about: delete it. */
2901 sbrec_logical_flow_delete(sbflow);
2905 enum ovn_datapath_type dp_type = od->nbs ? DP_SWITCH : DP_ROUTER;
2906 enum ovn_pipeline pipeline
2907 = !strcmp(sbflow->pipeline, "ingress") ? P_IN : P_OUT;
2908 struct ovn_lflow *lflow = ovn_lflow_find(
2909 &lflows, od, ovn_stage_build(dp_type, pipeline, sbflow->table_id),
2910 sbflow->priority, sbflow->match, sbflow->actions);
/* Row already matches a desired flow: keep the row, drop the in-memory
 * copy so it is not re-inserted below.  Otherwise the row is stale. */
2912 ovn_lflow_destroy(&lflows, lflow);
2914 sbrec_logical_flow_delete(sbflow);
/* Whatever is left in 'lflows' has no existing row: insert it. */
2917 struct ovn_lflow *lflow, *next_lflow;
2918 HMAP_FOR_EACH_SAFE (lflow, next_lflow, hmap_node, &lflows) {
2919 enum ovn_pipeline pipeline = ovn_stage_get_pipeline(lflow->stage);
2920 uint8_t table = ovn_stage_get_table(lflow->stage);
2922 sbflow = sbrec_logical_flow_insert(ctx->ovnsb_txn);
2923 sbrec_logical_flow_set_logical_datapath(sbflow, lflow->od->sb);
2924 sbrec_logical_flow_set_pipeline(
2925 sbflow, pipeline == P_IN ? "ingress" : "egress");
2926 sbrec_logical_flow_set_table_id(sbflow, table);
2927 sbrec_logical_flow_set_priority(sbflow, lflow->priority);
2928 sbrec_logical_flow_set_match(sbflow, lflow->match);
2929 sbrec_logical_flow_set_actions(sbflow, lflow->actions);
/* Record the human-readable stage name for debugging (ovn-sbctl). */
2931 const struct smap ids = SMAP_CONST1(&ids, "stage-name",
2932 ovn_stage_to_str(lflow->stage));
2933 sbrec_logical_flow_set_external_ids(sbflow, &ids);
2935 ovn_lflow_destroy(&lflows, lflow);
2937 hmap_destroy(&lflows);
2939 /* Push changes to the Multicast_Group table to database. */
2940 const struct sbrec_multicast_group *sbmc, *next_sbmc;
2941 SBREC_MULTICAST_GROUP_FOR_EACH_SAFE (sbmc, next_sbmc, ctx->ovnsb_idl) {
2942 struct ovn_datapath *od = ovn_datapath_from_sbrec(datapaths,
2945 sbrec_multicast_group_delete(sbmc);
2949 struct multicast_group group = { .name = sbmc->name,
2950 .key = sbmc->tunnel_key };
2951 struct ovn_multicast *mc = ovn_multicast_find(&mcgroups, od, &group);
/* Existing row matches a desired group: refresh its ports in place. */
2953 ovn_multicast_update_sbrec(mc, sbmc);
2954 ovn_multicast_destroy(&mcgroups, mc);
2956 sbrec_multicast_group_delete(sbmc);
/* Insert the multicast groups that had no existing row. */
2959 struct ovn_multicast *mc, *next_mc;
2960 HMAP_FOR_EACH_SAFE (mc, next_mc, hmap_node, &mcgroups) {
2961 sbmc = sbrec_multicast_group_insert(ctx->ovnsb_txn);
2962 sbrec_multicast_group_set_datapath(sbmc, mc->datapath->sb);
2963 sbrec_multicast_group_set_name(sbmc, mc->group->name);
2964 sbrec_multicast_group_set_tunnel_key(sbmc, mc->group->key);
2965 ovn_multicast_update_sbrec(mc, sbmc);
2966 ovn_multicast_destroy(&mcgroups, mc);
2968 hmap_destroy(&mcgroups);
2971 /* OVN_Northbound and OVN_Southbound have an identical Address_Set table.
2972 * We always update OVN_Southbound to match the current data in
2973 * OVN_Northbound, so that the address sets used in Logical_Flows in
2974 * OVN_Southbound is checked against the proper set.*/
/* Reconciliation approach: index all southbound Address_Set rows by name,
 * walk the northbound rows updating/creating the matching southbound row
 * (removing each handled entry from the index), then delete whatever is
 * left in the index — those are stale southbound rows. */
2976 sync_address_sets(struct northd_context *ctx)
2978 struct shash sb_address_sets = SHASH_INITIALIZER(&sb_address_sets);
2980 const struct sbrec_address_set *sb_address_set;
2981 SBREC_ADDRESS_SET_FOR_EACH (sb_address_set, ctx->ovnsb_idl) {
2982 shash_add(&sb_address_sets, sb_address_set->name, sb_address_set);
2985 const struct nbrec_address_set *nb_address_set;
2986 NBREC_ADDRESS_SET_FOR_EACH (nb_address_set, ctx->ovnnb_idl) {
2987 sb_address_set = shash_find_and_delete(&sb_address_sets,
2988 nb_address_set->name);
2989 if (!sb_address_set) {
/* No southbound counterpart yet: create one with the same name. */
2990 sb_address_set = sbrec_address_set_insert(ctx->ovnsb_txn);
2991 sbrec_address_set_set_name(sb_address_set, nb_address_set->name);
/* Unconditionally copy the addresses; the IDL suppresses no-op writes. */
2994 sbrec_address_set_set_addresses(sb_address_set,
2995 /* "char **" is not compatible with "const char **" */
2996 (const char **) nb_address_set->addresses,
2997 nb_address_set->n_addresses);
3000 struct shash_node *node, *next;
3001 SHASH_FOR_EACH_SAFE (node, next, &sb_address_sets) {
3002 sbrec_address_set_delete(node->data);
3003 shash_delete(&sb_address_sets, node);
3005 shash_destroy(&sb_address_sets);
/* One northbound-driven pass: recomputes datapaths, ports, logical flows,
 * and address sets from the current OVN_NB contents and pushes the results
 * into OVN_SB.  Bails out early when no southbound transaction is open
 * (nothing could be committed anyway). */
3009 ovnnb_db_run(struct northd_context *ctx)
3011 if (!ctx->ovnsb_txn) {
3014 struct hmap datapaths, ports;
3015 build_datapaths(ctx, &datapaths);
3016 build_ports(ctx, &datapaths, &ports);
3017 build_lflows(ctx, &datapaths, &ports);
3019 sync_address_sets(ctx);
/* Tear down the per-iteration in-memory indexes; they are rebuilt from
 * scratch on every pass. */
3021 struct ovn_datapath *dp, *next_dp;
3022 HMAP_FOR_EACH_SAFE (dp, next_dp, key_node, &datapaths) {
3023 ovn_datapath_destroy(&datapaths, dp);
3025 hmap_destroy(&datapaths);
3027 struct ovn_port *port, *next_port;
3028 HMAP_FOR_EACH_SAFE (port, next_port, key_node, &ports) {
3029 ovn_port_destroy(&ports, port);
3031 hmap_destroy(&ports);
3035 * The only change we get notified about is if the 'chassis' column of the
3036 * 'Port_Binding' table changes. When this column is not empty, it means we
3037 * need to set the corresponding logical port as 'up' in the northbound DB.
/* Implementation: build a temporary name-indexed hash of all northbound
 * logical switch ports, then for each southbound Port_Binding look up the
 * matching port and reconcile its 'up' state with whether the binding has
 * a chassis.  Bails out early when no northbound transaction is open. */
3040 ovnsb_db_run(struct northd_context *ctx)
3042 if (!ctx->ovnnb_txn) {
3045 struct hmap lports_hmap;
3046 const struct sbrec_port_binding *sb;
3047 const struct nbrec_logical_switch_port *nb;
/* Local bucket type pairing a hash node with its northbound port row. */
3049 struct lport_hash_node {
3050 struct hmap_node node;
3051 const struct nbrec_logical_switch_port *nb;
3054 hmap_init(&lports_hmap);
3056 NBREC_LOGICAL_SWITCH_PORT_FOR_EACH(nb, ctx->ovnnb_idl) {
3057 hash_node = xzalloc(sizeof *hash_node);
3059 hmap_insert(&lports_hmap, &hash_node->node, hash_string(nb->name, 0));
3062 SBREC_PORT_BINDING_FOR_EACH(sb, ctx->ovnsb_idl) {
3064 HMAP_FOR_EACH_WITH_HASH(hash_node, node,
3065 hash_string(sb->logical_port, 0),
/* Hash buckets can collide; confirm the name actually matches. */
3067 if (!strcmp(sb->logical_port, hash_node->nb->name)) {
3074 /* The logical port doesn't exist for this port binding. This can
3075 * happen under normal circumstances when ovn-northd hasn't gotten
3076 * around to pruning the Port_Binding yet. */
/* Flip 'up' only when it disagrees with the binding's chassis state
 * (elided lines set the local 'up' boolean to true resp. false). */
3080 if (sb->chassis && (!nb->up || !*nb->up)) {
3082 nbrec_logical_switch_port_set_up(nb, &up, 1);
3083 } else if (!sb->chassis && (!nb->up || *nb->up)) {
3085 nbrec_logical_switch_port_set_up(nb, &up, 1);
/* Free the temporary index (HMAP_FOR_EACH_POP removes as it iterates). */
3089 HMAP_FOR_EACH_POP(hash_node, node, &lports_hmap) {
3092 hmap_destroy(&lports_hmap);
/* Lazily-built default northbound DB location: a unix socket under the OVS
 * run directory.  The string is cached in a file-scope static and freed at
 * process exit by main(). */
3096 static char *default_nb_db_;
3101 if (!default_nb_db_) {
3102 default_nb_db_ = xasprintf("unix:%s/ovnnb_db.sock", ovs_rundir());
3104 return default_nb_db_;
/* Lazily-built default southbound DB location; same caching scheme as
 * default_nb_db() above, freed at process exit by main(). */
3107 static char *default_sb_db_;
3112 if (!default_sb_db_) {
3113 default_sb_db_ = xasprintf("unix:%s/ovnsb_db.sock", ovs_rundir());
3115 return default_sb_db_;
/* Parses ovn-northd's command-line options, filling in the file-scope
 * 'ovnnb_db' / 'ovnsb_db' globals (falling back to the default unix-socket
 * locations when the corresponding option was not given).  Daemon, vlog and
 * SSL options are delegated to the shared OVS option-handler macros. */
3119 parse_options(int argc OVS_UNUSED, char *argv[] OVS_UNUSED)
3122 DAEMON_OPTION_ENUMS,
3125 static const struct option long_options[] = {
3126 {"ovnsb-db", required_argument, NULL, 'd'},
3127 {"ovnnb-db", required_argument, NULL, 'D'},
3128 {"help", no_argument, NULL, 'h'},
3129 {"options", no_argument, NULL, 'o'},
3130 {"version", no_argument, NULL, 'V'},
3131 DAEMON_LONG_OPTIONS,
3133 STREAM_SSL_LONG_OPTIONS,
/* Derive the getopt short-option string from the long-option table so the
 * two can never fall out of sync. */
3136 char *short_options = ovs_cmdl_long_options_to_short_options(long_options);
3141 c = getopt_long(argc, argv, short_options, long_options, NULL);
3147 DAEMON_OPTION_HANDLERS;
3148 VLOG_OPTION_HANDLERS;
3149 STREAM_SSL_OPTION_HANDLERS;
3164 ovs_cmdl_print_options(long_options);
3168 ovs_print_version(0, 0);
/* Fall back to the default socket paths when not set on the command line
 * (the surrounding if (!ovnsb_db) / if (!ovnnb_db) guards are elided). */
3177 ovnsb_db = default_sb_db();
3181 ovnnb_db = default_nb_db();
3184 free(short_options);
/* Registers 'column' with the IDL for replication but suppresses change
 * alerts for it: ovn-northd writes these columns itself and does not need
 * to be woken up when they change. */
3188 add_column_noalert(struct ovsdb_idl *idl,
3189 const struct ovsdb_idl_column *column)
3191 ovsdb_idl_add_column(idl, column);
3192 ovsdb_idl_omit_alert(idl, column);
/* Daemon entry point.  Sets up signal handling, option parsing,
 * daemonization and the unixctl "exit" command; creates IDL loops for the
 * northbound (full monitoring) and southbound (selective monitoring, see
 * the table/column registrations below) databases; then runs the main loop,
 * calling ovnnb_db_run()/ovnsb_db_run() each iteration until asked to exit
 * (loop body and exit handling are in elided lines). */
3196 main(int argc, char *argv[])
3198 int res = EXIT_SUCCESS;
3199 struct unixctl_server *unixctl;
3203 fatal_ignore_sigpipe();
3204 set_program_name(argv[0]);
3205 service_start(&argc, &argv);
3206 parse_options(argc, argv);
3208 daemonize_start(false);
3210 retval = unixctl_server_create(NULL, &unixctl);
3214 unixctl_command_register("exit", "", 0, 0, ovn_northd_exit, &exiting);
3216 daemonize_complete();
3221 /* We want to detect all changes to the ovn-nb db. */
3222 struct ovsdb_idl_loop ovnnb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
3223 ovsdb_idl_create(ovnnb_db, &nbrec_idl_class, true, true));
/* Southbound: monitor nothing by default (third arg false); each table and
 * column ovn-northd needs is registered explicitly below. */
3225 struct ovsdb_idl_loop ovnsb_idl_loop = OVSDB_IDL_LOOP_INITIALIZER(
3226 ovsdb_idl_create(ovnsb_db, &sbrec_idl_class, false, true));
3228 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_logical_flow);
3229 add_column_noalert(ovnsb_idl_loop.idl,
3230 &sbrec_logical_flow_col_logical_datapath);
3231 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_pipeline);
3232 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_table_id);
3233 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_priority);
3234 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_match);
3235 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_logical_flow_col_actions);
3237 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_multicast_group);
3238 add_column_noalert(ovnsb_idl_loop.idl,
3239 &sbrec_multicast_group_col_datapath);
3240 add_column_noalert(ovnsb_idl_loop.idl,
3241 &sbrec_multicast_group_col_tunnel_key);
3242 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_multicast_group_col_name);
3243 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_multicast_group_col_ports);
3245 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_datapath_binding);
3246 add_column_noalert(ovnsb_idl_loop.idl,
3247 &sbrec_datapath_binding_col_tunnel_key);
3248 add_column_noalert(ovnsb_idl_loop.idl,
3249 &sbrec_datapath_binding_col_external_ids);
3251 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_port_binding);
3252 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_datapath);
3253 add_column_noalert(ovnsb_idl_loop.idl,
3254 &sbrec_port_binding_col_logical_port);
3255 add_column_noalert(ovnsb_idl_loop.idl,
3256 &sbrec_port_binding_col_tunnel_key);
3257 add_column_noalert(ovnsb_idl_loop.idl,
3258 &sbrec_port_binding_col_parent_port);
3259 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_tag);
3260 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_type);
3261 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_options);
3262 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_port_binding_col_mac);
/* 'chassis' is the one southbound column whose changes must wake us up
 * (see ovnsb_db_run()), so it is added WITH alerts. */
3263 ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_port_binding_col_chassis);
3265 ovsdb_idl_add_table(ovnsb_idl_loop.idl, &sbrec_table_address_set);
3266 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_address_set_col_name);
3267 add_column_noalert(ovnsb_idl_loop.idl, &sbrec_address_set_col_addresses);
/* Main loop body: refresh both IDLs, open transactions, run one pass. */
3272 struct northd_context ctx = {
3273 .ovnnb_idl = ovnnb_idl_loop.idl,
3274 .ovnnb_txn = ovsdb_idl_loop_run(&ovnnb_idl_loop),
3275 .ovnsb_idl = ovnsb_idl_loop.idl,
3276 .ovnsb_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
3282 unixctl_server_run(unixctl);
3283 unixctl_server_wait(unixctl);
/* When exiting, skip blocking in poll so the loop can wind down at once. */
3285 poll_immediate_wake();
3287 ovsdb_idl_loop_commit_and_wait(&ovnnb_idl_loop);
3288 ovsdb_idl_loop_commit_and_wait(&ovnsb_idl_loop);
3291 if (should_service_stop()) {
/* Orderly shutdown: release the unixctl server, both IDL loops, and the
 * lazily-allocated default DB path strings. */
3296 unixctl_server_destroy(unixctl);
3297 ovsdb_idl_loop_destroy(&ovnnb_idl_loop);
3298 ovsdb_idl_loop_destroy(&ovnsb_idl_loop);
3301 free(default_nb_db_);
3302 free(default_sb_db_);
3307 ovn_northd_exit(struct unixctl_conn *conn, int argc OVS_UNUSED,
3308 const char *argv[] OVS_UNUSED, void *exiting_)
3310 bool *exiting = exiting_;
3313 unixctl_command_reply(conn, NULL);