diff options
author | Ben Pfaff <blp@nicira.com> | 2009-07-08 13:19:16 -0700 |
---|---|---|
committer | Ben Pfaff <blp@nicira.com> | 2009-07-08 13:19:16 -0700 |
commit | 064af42167bf4fc9aaea2702d80ce08074b889c0 (patch) | |
tree | efd15a6dc2402eeec273bb34db3b2445687589e5 /vswitchd |
Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.v0.90.0
Diffstat (limited to 'vswitchd')
-rw-r--r-- | vswitchd/.gitignore | 7 | ||||
-rw-r--r-- | vswitchd/automake.mk | 40 | ||||
-rw-r--r-- | vswitchd/bridge.c | 3058 | ||||
-rw-r--r-- | vswitchd/bridge.h | 43 | ||||
-rw-r--r-- | vswitchd/mgmt.c | 679 | ||||
-rw-r--r-- | vswitchd/mgmt.h | 36 | ||||
-rw-r--r-- | vswitchd/ovs-brcompatd.8.in | 49 | ||||
-rw-r--r-- | vswitchd/ovs-brcompatd.c | 766 | ||||
-rw-r--r-- | vswitchd/ovs-vswitchd.8.in | 87 | ||||
-rw-r--r-- | vswitchd/ovs-vswitchd.c | 255 | ||||
-rw-r--r-- | vswitchd/ovs-vswitchd.conf.5.in | 642 | ||||
-rw-r--r-- | vswitchd/ovs-vswitchd.h | 32 | ||||
-rw-r--r-- | vswitchd/port.c | 68 | ||||
-rw-r--r-- | vswitchd/port.h | 33 | ||||
-rw-r--r-- | vswitchd/proc-net-compat.c | 344 | ||||
-rw-r--r-- | vswitchd/proc-net-compat.h | 51 | ||||
-rw-r--r-- | vswitchd/xenserver.c | 90 | ||||
-rw-r--r-- | vswitchd/xenserver.h | 32 |
18 files changed, 6312 insertions, 0 deletions
diff --git a/vswitchd/.gitignore b/vswitchd/.gitignore new file mode 100644 index 00000000..01d57ae7 --- /dev/null +++ b/vswitchd/.gitignore @@ -0,0 +1,7 @@ +/Makefile +/Makefile.in +/ovs-brcompatd +/ovs-brcompatd.8 +/ovs-vswitchd +/ovs-vswitchd.8 +/ovs-vswitchd.conf.5 diff --git a/vswitchd/automake.mk b/vswitchd/automake.mk new file mode 100644 index 00000000..6883731e --- /dev/null +++ b/vswitchd/automake.mk @@ -0,0 +1,40 @@ +sbin_PROGRAMS += vswitchd/ovs-vswitchd vswitchd/ovs-brcompatd +man_MANS += \ + vswitchd/ovs-vswitchd.conf.5 \ + vswitchd/ovs-vswitchd.8 \ + vswitchd/ovs-brcompatd.8 +DISTCLEANFILES += \ + vswitchd/ovs-vswitchd.conf.5 \ + vswitchd/ovs-vswitchd.8 \ + vswitchd/ovs-brcompatd.8 + +vswitchd_ovs_vswitchd_SOURCES = \ + vswitchd/bridge.c \ + vswitchd/bridge.h \ + vswitchd/mgmt.c \ + vswitchd/mgmt.h \ + vswitchd/port.c \ + vswitchd/port.h \ + vswitchd/proc-net-compat.c \ + vswitchd/proc-net-compat.h \ + vswitchd/ovs-vswitchd.c \ + vswitchd/ovs-vswitchd.h \ + vswitchd/xenserver.c \ + vswitchd/xenserver.h +vswitchd_ovs_vswitchd_LDADD = \ + secchan/libsecchan.a \ + lib/libopenvswitch.a \ + $(FAULT_LIBS) \ + $(SSL_LIBS) + +vswitchd_ovs_brcompatd_SOURCES = \ + vswitchd/ovs-brcompatd.c + +vswitchd_ovs_brcompatd_LDADD = \ + lib/libopenvswitch.a \ + $(FAULT_LIBS) + +EXTRA_DIST += \ + vswitchd/ovs-vswitchd.conf.5.in \ + vswitchd/ovs-vswitchd.8.in \ + vswitchd/ovs-brcompatd.8.in diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c new file mode 100644 index 00000000..cfd4dcf7 --- /dev/null +++ b/vswitchd/bridge.c @@ -0,0 +1,3058 @@ +/* Copyright (c) 2008, 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include <config.h> +#include "bridge.h" +#include <assert.h> +#include <errno.h> +#include <arpa/inet.h> +#include <ctype.h> +#include <inttypes.h> +#include <net/if.h> +#include <openflow/openflow.h> +#include <signal.h> +#include <stdlib.h> +#include <strings.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <unistd.h> +#include "bitmap.h" +#include "cfg.h" +#include "coverage.h" +#include "dirs.h" +#include "dpif.h" +#include "dynamic-string.h" +#include "flow.h" +#include "hash.h" +#include "list.h" +#include "mac-learning.h" +#include "netdev.h" +#include "odp-util.h" +#include "ofp-print.h" +#include "ofpbuf.h" +#include "poll-loop.h" +#include "port-array.h" +#include "proc-net-compat.h" +#include "process.h" +#include "secchan/ofproto.h" +#include "socket-util.h" +#include "stp.h" +#include "svec.h" +#include "timeval.h" +#include "util.h" +#include "vconn.h" +#include "vconn-ssl.h" +#include "xenserver.h" +#include "xtoxll.h" + +#define THIS_MODULE VLM_bridge +#include "vlog.h" + +struct dst { + uint16_t vlan; + uint16_t dp_ifidx; +}; + +extern uint64_t mgmt_id; + +struct iface { + struct port *port; /* Containing port. */ + size_t port_ifidx; /* Index within containing port. */ + + char *name; /* Host network device name. */ + int dp_ifidx; /* Index within kernel datapath. */ + + uint8_t mac[ETH_ADDR_LEN]; /* Ethernet address (all zeros if unknowns). */ + + tag_type tag; /* Tag associated with this interface. */ + bool enabled; /* May be chosen for flows? */ + long long delay_expires; /* Time after which 'enabled' may change. */ +}; + +#define BOND_MASK 0xff +struct bond_entry { + int iface_idx; /* Index of assigned iface, or -1 if none. */ + uint64_t tx_bytes; /* Count of bytes recently transmitted. */ + tag_type iface_tag; /* Tag associated with iface_idx. */ +}; + +#define MAX_MIRRORS 32 +typedef uint32_t mirror_mask_t; +#define MIRROR_MASK_C(X) UINT32_C(X) +BUILD_ASSERT_DECL(sizeof(mirror_mask_t) * CHAR_BIT >= MAX_MIRRORS); +struct mirror { + struct bridge *bridge; + size_t idx; + char *name; + + /* Selection criteria. */ + struct svec src_ports; + struct svec dst_ports; + int *vlans; + size_t n_vlans; + + /* Output. */ + struct port *out_port; + int out_vlan; +}; + +#define FLOOD_PORT ((struct port *) 1) /* The 'flood' output port. */ +struct port { + struct bridge *bridge; + size_t port_idx; + int vlan; /* -1=trunk port, else a 12-bit VLAN ID. */ + unsigned long *trunks; /* Bitmap of trunked VLANs, if 'vlan' == -1. */ + char *name; + + /* An ordinary bridge port has 1 interface. + * A bridge port for bonding has at least 2 interfaces. */ + struct iface **ifaces; + size_t n_ifaces, allocated_ifaces; + + /* Bonding info. */ + struct bond_entry *bond_hash; /* An array of (BOND_MASK + 1) elements. */ + int active_iface; /* Ifidx on which bcasts accepted, or -1. */ + tag_type active_iface_tag; /* Tag for bcast flows. */ + tag_type no_ifaces_tag; /* Tag for flows when all ifaces disabled. */ + int updelay, downdelay; /* Delay before iface goes up/down, in ms. */ + + /* Port mirroring info. */ + mirror_mask_t src_mirrors; /* Mirrors triggered when packet received. */ + mirror_mask_t dst_mirrors; /* Mirrors triggered when packet sent. */ + bool is_mirror_output_port; /* Does port mirroring send frames here? */ + + /* Spanning tree info. */ + enum stp_state stp_state; /* Always STP_FORWARDING if STP not in use. */ + tag_type stp_state_tag; /* Tag for STP state change. */ +}; + +#define DP_MAX_PORTS 255 +struct bridge { + struct list node; /* Node in global list of bridges. */ + char *name; /* User-specified arbitrary name. */ + struct mac_learning *ml; /* MAC learning table, or null not to learn. */ + bool sent_config_request; /* Successfully sent config request? */ + uint8_t default_ea[ETH_ADDR_LEN]; /* Default MAC. */ + + /* Support for remote controllers. */ + char *controller; /* NULL if there is no remote controller; + * "discover" to do controller discovery; + * otherwise a vconn name. */ + + /* OpenFlow switch processing. */ + struct ofproto *ofproto; /* OpenFlow switch. */ + + /* Kernel datapath information. */ + struct dpif dpif; /* Kernel datapath. */ + struct port_array ifaces; /* Indexed by kernel datapath port number. */ + + /* Bridge ports. */ + struct port **ports; + size_t n_ports, allocated_ports; + + /* Bonding. */ + bool has_bonded_ports; + long long int bond_next_rebalance; + + /* Flow tracking. */ + bool flush; + + /* Flow statistics gathering. */ + time_t next_stats_request; + + /* Port mirroring. */ + struct mirror *mirrors[MAX_MIRRORS]; + + /* Spanning tree. */ + struct stp *stp; + long long int stp_last_tick; +}; + +/* List of all bridges. */ +static struct list all_bridges = LIST_INITIALIZER(&all_bridges); + +/* Maximum number of datapaths. */ +enum { DP_MAX = 256 }; + +static struct bridge *bridge_create(const char *name); +static void bridge_destroy(struct bridge *); +static struct bridge *bridge_lookup(const char *name); +static int bridge_run_one(struct bridge *); +static void bridge_reconfigure_one(struct bridge *); +static void bridge_reconfigure_controller(struct bridge *); +static void bridge_get_all_ifaces(const struct bridge *, struct svec *ifaces); +static void bridge_fetch_dp_ifaces(struct bridge *); +static void bridge_flush(struct bridge *); +static void bridge_pick_local_hw_addr(struct bridge *, + uint8_t ea[ETH_ADDR_LEN], + const char **devname); +static uint64_t bridge_pick_datapath_id(struct bridge *, + const uint8_t bridge_ea[ETH_ADDR_LEN], + const char *devname); +static uint64_t dpid_from_hash(const void *, size_t nbytes); + +static void bond_run(struct bridge *); +static void bond_wait(struct bridge *); +static void bond_rebalance_port(struct port *); + +static void port_create(struct bridge *, const char *name); +static void port_reconfigure(struct port *); +static void port_destroy(struct port *); +static struct port *port_lookup(const struct bridge *, const char *name); +static struct port *port_from_dp_ifidx(const struct bridge *, + uint16_t dp_ifidx); +static void port_update_bond_compat(struct port *); +static void port_update_vlan_compat(struct port *); + +static void mirror_create(struct bridge *, const char *name); +static void mirror_destroy(struct mirror *); +static void mirror_reconfigure(struct bridge *); +static void mirror_reconfigure_one(struct mirror *); +static bool vlan_is_mirrored(const struct mirror *, int vlan); + +static void brstp_reconfigure(struct bridge *); +static void brstp_adjust_timers(struct bridge *); +static void brstp_run(struct bridge *); +static void brstp_wait(struct bridge *); + +static void iface_create(struct port *, const char *name); +static void iface_destroy(struct iface *); +static struct iface *iface_lookup(const struct bridge *, const char *name); +static struct iface *iface_from_dp_ifidx(const struct bridge *, + uint16_t dp_ifidx); + +/* Hooks into ofproto processing. */ +static struct ofhooks bridge_ofhooks; + +/* Public functions. */ + +/* Adds the name of each interface used by a bridge, including local and + * internal ports, to 'svec'. */ +void +bridge_get_ifaces(struct svec *svec) +{ + struct bridge *br, *next; + size_t i, j; + + LIST_FOR_EACH_SAFE (br, next, struct bridge, node, &all_bridges) { + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + if (iface->dp_ifidx < 0) { + VLOG_ERR("%s interface not in dp%u, ignoring", + iface->name, dpif_id(&br->dpif)); + } else { + if (iface->dp_ifidx != ODPP_LOCAL) { + svec_add(svec, iface->name); + } + } + } + } + } +} + +/* The caller must already have called cfg_read(). */ +void +bridge_init(void) +{ + int retval; + int i; + + for (i = 0; i < DP_MAX; i++) { + struct dpif dpif; + char devname[16]; + + sprintf(devname, "dp%d", i); + retval = dpif_open(devname, &dpif); + if (!retval) { + char dpif_name[IF_NAMESIZE]; + if (dpif_get_name(&dpif, dpif_name, sizeof dpif_name) + || !cfg_has("bridge.%s.port", dpif_name)) { + dpif_delete(&dpif); + } + dpif_close(&dpif); + } else if (retval != ENODEV) { + VLOG_ERR("failed to delete datapath dp%d: %s", + i, strerror(retval)); + } + } + + bridge_reconfigure(); +} + +#ifdef HAVE_OPENSSL +static bool +config_string_change(const char *key, char **valuep) +{ + const char *value = cfg_get_string(0, "%s", key); + if (value && (!*valuep || strcmp(value, *valuep))) { + free(*valuep); + *valuep = xstrdup(value); + return true; + } else { + return false; + } +} + +static void +bridge_configure_ssl(void) +{ + /* XXX SSL should be configurable on a per-bridge basis. + * XXX should be possible to de-configure SSL. */ + static char *private_key_file; + static char *certificate_file; + static char *cacert_file; + + if (config_string_change("ssl.private-key", &private_key_file)) { + vconn_ssl_set_private_key_file(private_key_file); + } + + if (config_string_change("ssl.certificate", &certificate_file)) { + vconn_ssl_set_certificate_file(certificate_file); + } + + if (config_string_change("ssl.ca-cert", &cacert_file)) { + vconn_ssl_set_ca_cert_file(cacert_file, + cfg_get_bool(0, "ssl.bootstrap-ca-cert")); + } +} +#endif + +void +bridge_reconfigure(void) +{ + struct svec old_br, new_br, raw_new_br; + struct bridge *br, *next; + size_t i, j; + + COVERAGE_INC(bridge_reconfigure); + + /* Collect old bridges. */ + svec_init(&old_br); + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + svec_add(&old_br, br->name); + } + + /* Collect new bridges. */ + svec_init(&raw_new_br); + cfg_get_subsections(&raw_new_br, "bridge"); + svec_init(&new_br); + for (i = 0; i < raw_new_br.n; i++) { + const char *name = raw_new_br.names[i]; + if ((!strncmp(name, "dp", 2) && isdigit(name[2])) || + (!strncmp(name, "nl:", 3) && isdigit(name[3]))) { + VLOG_ERR("%s is not a valid bridge name (bridges may not be " + "named \"dp\" or \"nl:\" followed by a digit)", name); + } else { + svec_add(&new_br, name); + } + } + svec_destroy(&raw_new_br); + + /* Get rid of deleted bridges and add new bridges. */ + svec_sort(&old_br); + svec_sort(&new_br); + assert(svec_is_unique(&old_br)); + assert(svec_is_unique(&new_br)); + LIST_FOR_EACH_SAFE (br, next, struct bridge, node, &all_bridges) { + if (!svec_contains(&new_br, br->name)) { + bridge_destroy(br); + } + } + for (i = 0; i < new_br.n; i++) { + const char *name = new_br.names[i]; + if (!svec_contains(&old_br, name)) { + bridge_create(name); + } + } + svec_destroy(&old_br); + svec_destroy(&new_br); + +#ifdef HAVE_OPENSSL + /* Configure SSL. */ + bridge_configure_ssl(); +#endif + + /* Reconfigure all bridges. */ + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + bridge_reconfigure_one(br); + } + + /* Add and delete ports on all datapaths. + * + * The kernel will reject any attempt to add a given port to a datapath if + * that port already belongs to a different datapath, so we must do all + * port deletions before any port additions. */ + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + struct odp_port *dpif_ports; + size_t n_dpif_ports; + struct svec want_ifaces; + + dpif_port_list(&br->dpif, &dpif_ports, &n_dpif_ports); + bridge_get_all_ifaces(br, &want_ifaces); + for (i = 0; i < n_dpif_ports; i++) { + const struct odp_port *p = &dpif_ports[i]; + if (!svec_contains(&want_ifaces, p->devname) + && strcmp(p->devname, br->name)) { + int retval = dpif_port_del(&br->dpif, p->port); + if (retval) { + VLOG_ERR("failed to remove %s interface from dp%u: %s", + p->devname, dpif_id(&br->dpif), strerror(retval)); + } + } + } + svec_destroy(&want_ifaces); + free(dpif_ports); + } + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + struct odp_port *dpif_ports; + size_t n_dpif_ports; + struct svec cur_ifaces, want_ifaces, add_ifaces; + int next_port_no; + + dpif_port_list(&br->dpif, &dpif_ports, &n_dpif_ports); + svec_init(&cur_ifaces); + for (i = 0; i < n_dpif_ports; i++) { + svec_add(&cur_ifaces, dpif_ports[i].devname); + } + free(dpif_ports); + svec_sort_unique(&cur_ifaces); + bridge_get_all_ifaces(br, &want_ifaces); + svec_diff(&want_ifaces, &cur_ifaces, &add_ifaces, NULL, NULL); + + next_port_no = 1; + for (i = 0; i < add_ifaces.n; i++) { + const char *if_name = add_ifaces.names[i]; + for (;;) { + int internal = cfg_get_bool(0, "iface.%s.internal", if_name); + int error = dpif_port_add(&br->dpif, if_name, next_port_no++, + internal ? ODP_PORT_INTERNAL : 0); + if (error != EEXIST) { + if (next_port_no >= 256) { + VLOG_ERR("ran out of valid port numbers on dp%u", + dpif_id(&br->dpif)); + goto out; + } + if (error) { + VLOG_ERR("failed to add %s interface to dp%u: %s", + if_name, dpif_id(&br->dpif), strerror(error)); + } + break; + } + } + } + out: + svec_destroy(&cur_ifaces); + svec_destroy(&want_ifaces); + svec_destroy(&add_ifaces); + } + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + uint8_t ea[8]; + uint64_t dpid; + struct iface *local_iface = NULL; + const char *devname; + uint8_t engine_type = br->dpif.minor; + uint8_t engine_id = br->dpif.minor; + bool add_id_to_iface = false; + struct svec nf_hosts; + + + bridge_fetch_dp_ifaces(br); + for (i = 0; i < br->n_ports; ) { + struct port *port = br->ports[i]; + + for (j = 0; j < port->n_ifaces; ) { + struct iface *iface = port->ifaces[j]; + if (iface->dp_ifidx < 0) { + VLOG_ERR("%s interface not in dp%u, dropping", + iface->name, dpif_id(&br->dpif)); + iface_destroy(iface); + } else { + if (iface->dp_ifidx == ODPP_LOCAL) { + local_iface = iface; + } + VLOG_DBG("dp%u has interface %s on port %d", + dpif_id(&br->dpif), iface->name, iface->dp_ifidx); + j++; + } + } + if (!port->n_ifaces) { + VLOG_ERR("%s port has no interfaces, dropping", port->name); + port_destroy(port); + continue; + } + i++; + } + + /* Pick local port hardware address, datapath ID. */ + bridge_pick_local_hw_addr(br, ea, &devname); + if (local_iface) { + int error = netdev_nodev_set_etheraddr(local_iface->name, ea); + if (error) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_ERR_RL(&rl, "bridge %s: failed to set bridge " + "Ethernet address: %s", + br->name, strerror(error)); + } + } + + dpid = bridge_pick_datapath_id(br, ea, devname); + ofproto_set_datapath_id(br->ofproto, dpid); + + /* Set NetFlow configuration on this bridge. */ + if (cfg_has("netflow.%s.engine-type", br->name)) { + engine_type = cfg_get_int(0, "netflow.%s.engine-type", + br->name); + } + if (cfg_has("netflow.%s.engine-id", br->name)) { + engine_id = cfg_get_int(0, "netflow.%s.engine-id", br->name); + } + if (cfg_has("netflow.%s.add-id-to-iface", br->name)) { + add_id_to_iface = cfg_get_bool(0, "netflow.%s.add-id-to-iface", + br->name); + } + if (add_id_to_iface && engine_id > 0x7f) { + VLOG_WARN("bridge %s: netflow port mangling may conflict with " + "another vswitch, choose an engine id less than 128", + br->name); + } + if (add_id_to_iface && br->n_ports > 0x1ff) { + VLOG_WARN("bridge %s: netflow port mangling will conflict with " + "another port when 512 or more ports are used", + br->name); + } + svec_init(&nf_hosts); + cfg_get_all_keys(&nf_hosts, "netflow.%s.host", br->name); + if (ofproto_set_netflow(br->ofproto, &nf_hosts, engine_type, + engine_id, add_id_to_iface)) { + VLOG_ERR("bridge %s: problem setting netflow collectors", + br->name); + } + + /* Update the controller and related settings. It would be more + * straightforward to call this from bridge_reconfigure_one(), but we + * can't do it there for two reasons. First, and most importantly, at + * that point we don't know the dp_ifidx of any interfaces that have + * been added to the bridge (because we haven't actually added them to + * the datapath). Second, at that point we haven't set the datapath ID + * yet; when a controller is configured, resetting the datapath ID will + * immediately disconnect from the controller, so it's better to set + * the datapath ID before the controller. */ + bridge_reconfigure_controller(br); + } + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + port_update_vlan_compat(port); + } + } + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + brstp_reconfigure(br); + } +} + +static void +bridge_pick_local_hw_addr(struct bridge *br, uint8_t ea[ETH_ADDR_LEN], + const char **devname) +{ + uint64_t requested_ea; + size_t i, j; + int error; + + *devname = NULL; + + /* Did the user request a particular MAC? */ + requested_ea = cfg_get_mac(0, "bridge.%s.mac", br->name); + if (requested_ea) { + eth_addr_from_uint64(requested_ea, ea); + if (eth_addr_is_multicast(ea)) { + VLOG_ERR("bridge %s: cannot set MAC address to multicast " + "address "ETH_ADDR_FMT, br->name, ETH_ADDR_ARGS(ea)); + } else if (eth_addr_is_zero(ea)) { + VLOG_ERR("bridge %s: cannot set MAC address to zero", br->name); + } else { + return; + } + } + + /* Otherwise choose the minimum MAC address among all of the interfaces. + * (Xen uses FE:FF:FF:FF:FF:FF for virtual interfaces so this will get the + * MAC of the physical interface in such an environment.) */ + memset(ea, 0xff, sizeof ea); + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (port->is_mirror_output_port) { + continue; + } + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + uint8_t iface_ea[ETH_ADDR_LEN]; + if (iface->dp_ifidx == ODPP_LOCAL + || cfg_get_bool(0, "iface.%s.internal", iface->name)) { + continue; + } + error = netdev_nodev_get_etheraddr(iface->name, iface_ea); + if (!error) { + if (!eth_addr_is_multicast(iface_ea) && + !eth_addr_is_reserved(iface_ea) && + !eth_addr_is_zero(iface_ea) && + memcmp(iface_ea, ea, ETH_ADDR_LEN) < 0) { + memcpy(ea, iface_ea, ETH_ADDR_LEN); + *devname = iface->name; + } + } else { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_ERR_RL(&rl, "failed to obtain Ethernet address of %s: %s", + iface->name, strerror(error)); + } + } + } + if (eth_addr_is_multicast(ea) || eth_addr_is_vif(ea)) { + memcpy(ea, br->default_ea, ETH_ADDR_LEN); + *devname = NULL; + VLOG_WARN("bridge %s: using default bridge Ethernet " + "address "ETH_ADDR_FMT, br->name, ETH_ADDR_ARGS(ea)); + } else { + VLOG_DBG("bridge %s: using bridge Ethernet address "ETH_ADDR_FMT, + br->name, ETH_ADDR_ARGS(ea)); + } +} + +/* Choose and returns the datapath ID for bridge 'br' given that the bridge + * Ethernet address is 'bridge_ea'. If 'bridge_ea' is the Ethernet address of + * a network device, then that network device's name must be passed in as + * 'devname'; if 'bridge_ea' was derived some other way, then 'devname' must be + * passed in as a null pointer. */ +static uint64_t +bridge_pick_datapath_id(struct bridge *br, + const uint8_t bridge_ea[ETH_ADDR_LEN], + const char *devname) +{ + /* + * The procedure for choosing a bridge MAC address will, in the most + * ordinary case, also choose a unique MAC that we can use as a datapath + * ID. In some special cases, though, multiple bridges will end up with + * the same MAC address. This is OK for the bridges, but it will confuse + * the OpenFlow controller, because each datapath needs a unique datapath + * ID. + * + * Datapath IDs must be unique. It is also very desirable that they be + * stable from one run to the next, so that policy set on a datapath + * "sticks". + */ + uint64_t dpid; + + dpid = cfg_get_dpid(0, "bridge.%s.datapath-id", br->name); + if (dpid) { + return dpid; + } + + if (devname) { + int vlan; + if (!netdev_get_vlan_vid(devname, &vlan)) { + /* + * A bridge whose MAC address is taken from a VLAN network device + * (that is, a network device created with vconfig(8) or similar + * tool) will have the same MAC address as a bridge on the VLAN + * device's physical network device. + * + * Handle this case by hashing the physical network device MAC + * along with the VLAN identifier. + */ + uint8_t buf[ETH_ADDR_LEN + 2]; + memcpy(buf, bridge_ea, ETH_ADDR_LEN); + buf[ETH_ADDR_LEN] = vlan >> 8; + buf[ETH_ADDR_LEN + 1] = vlan; + return dpid_from_hash(buf, sizeof buf); + } else { + /* + * Assume that this bridge's MAC address is unique, since it + * doesn't fit any of the cases we handle specially. + */ + } + } else { + /* + * A purely internal bridge, that is, one that has no non-virtual + * network devices on it at all, is more difficult because it has no + * natural unique identifier at all. + * + * When the host is a XenServer, we handle this case by hashing the + * host's UUID with the name of the bridge. Names of bridges are + * persistent across XenServer reboots, although they can be reused if + * an internal network is destroyed and then a new one is later + * created, so this is fairly effective. + * + * When the host is not a XenServer, we punt by using a random MAC + * address on each run. + */ + const char *host_uuid = xenserver_get_host_uuid(); + if (host_uuid) { + char *combined = xasprintf("%s,%s", host_uuid, br->name); + dpid = dpid_from_hash(combined, strlen(combined)); + free(combined); + return dpid; + } + } + + return eth_addr_to_uint64(bridge_ea); +} + +static uint64_t +dpid_from_hash(const void *data, size_t n) +{ + uint8_t hash[SHA1HashSize]; + + BUILD_ASSERT_DECL(sizeof hash >= ETH_ADDR_LEN); + SHA1Bytes(data, n, hash); + eth_addr_mark_random(hash); + return eth_addr_to_uint64(hash); +} + +int +bridge_run(void) +{ + struct bridge *br, *next; + int retval; + + retval = 0; + LIST_FOR_EACH_SAFE (br, next, struct bridge, node, &all_bridges) { + int error = bridge_run_one(br); + if (error) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_ERR_RL(&rl, "bridge %s: datapath was destroyed externally, " + "forcing reconfiguration", br->name); + if (!retval) { + retval = error; + } + } + } + return retval; +} + +void +bridge_wait(void) +{ + struct bridge *br; + + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + ofproto_wait(br->ofproto); + if (br->controller) { + continue; + } + + if (br->ml) { + mac_learning_wait(br->ml); + } + bond_wait(br); + brstp_wait(br); + } +} + +/* Forces 'br' to revalidate all of its flows. This is appropriate when 'br''s + * configuration changes. */ +static void +bridge_flush(struct bridge *br) +{ + COVERAGE_INC(bridge_flush); + br->flush = true; + if (br->ml) { + mac_learning_flush(br->ml); + } +} + +/* Bridge reconfiguration functions. */ + +static struct bridge * +bridge_create(const char *name) +{ + struct bridge *br; + int error; + + assert(!bridge_lookup(name)); + br = xcalloc(1, sizeof *br); + + error = dpif_create(name, &br->dpif); + if (error == EEXIST) { + error = dpif_open(name, &br->dpif); + if (error) { + VLOG_ERR("datapath %s already exists but cannot be opened: %s", + name, strerror(error)); + free(br); + return NULL; + } + dpif_flow_flush(&br->dpif); + } else if (error) { + VLOG_ERR("failed to create datapath %s: %s", name, strerror(error)); + free(br); + return NULL; + } + + error = ofproto_create(name, &bridge_ofhooks, br, &br->ofproto); + if (error) { + VLOG_ERR("failed to create switch %s: %s", name, strerror(error)); + dpif_delete(&br->dpif); + dpif_close(&br->dpif); + free(br); + return NULL; + } + + br->name = xstrdup(name); + br->ml = mac_learning_create(); + br->sent_config_request = false; + eth_addr_random(br->default_ea); + + port_array_init(&br->ifaces); + + br->flush = false; + br->bond_next_rebalance = time_msec() + 10000; + + list_push_back(&all_bridges, &br->node); + + VLOG_INFO("created bridge %s on dp%u", br->name, dpif_id(&br->dpif)); + + return br; +} + +static void +bridge_destroy(struct bridge *br) +{ + if (br) { + int error; + + while (br->n_ports > 0) { + port_destroy(br->ports[br->n_ports - 1]); + } + list_remove(&br->node); + error = dpif_delete(&br->dpif); + if (error && error != ENOENT) { + VLOG_ERR("failed to delete dp%u: %s", + dpif_id(&br->dpif), strerror(error)); + } + dpif_close(&br->dpif); + ofproto_destroy(br->ofproto); + free(br->controller); + mac_learning_destroy(br->ml); + port_array_destroy(&br->ifaces); + free(br->ports); + free(br->name); + free(br); + } +} + +static struct bridge * +bridge_lookup(const char *name) +{ + struct bridge *br; + + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + if (!strcmp(br->name, name)) { + return br; + } + } + return NULL; +} + +bool +bridge_exists(const char *name) +{ + return bridge_lookup(name) ? true : false; +} + +uint64_t +bridge_get_datapathid(const char *name) +{ + struct bridge *br = bridge_lookup(name); + return br ? ofproto_get_datapath_id(br->ofproto) : 0; +} + +static int +bridge_run_one(struct bridge *br) +{ + int error; + + error = ofproto_run1(br->ofproto); + if (error) { + return error; + } + + if (br->ml) { + mac_learning_run(br->ml, ofproto_get_revalidate_set(br->ofproto)); + } + bond_run(br); + brstp_run(br); + + error = ofproto_run2(br->ofproto, br->flush); + br->flush = false; + + return error; +} + +static const char * +bridge_get_controller(const struct bridge *br) +{ + const char *controller; + + controller = cfg_get_string(0, "bridge.%s.controller", br->name); + if (!controller) { + controller = cfg_get_string(0, "mgmt.controller"); + } + return controller && controller[0] ? controller : NULL; +} + +static void +bridge_reconfigure_one(struct bridge *br) +{ + struct svec old_ports, new_ports, ifaces; + struct svec listeners, old_listeners; + struct svec snoops, old_snoops; + size_t i, j; + + /* Collect old ports. */ + svec_init(&old_ports); + for (i = 0; i < br->n_ports; i++) { + svec_add(&old_ports, br->ports[i]->name); + } + svec_sort(&old_ports); + assert(svec_is_unique(&old_ports)); + + /* Collect new ports. */ + svec_init(&new_ports); + cfg_get_all_keys(&new_ports, "bridge.%s.port", br->name); + svec_sort(&new_ports); + if (bridge_get_controller(br) && !svec_contains(&new_ports, br->name)) { + svec_add(&new_ports, br->name); + svec_sort(&new_ports); + } + if (!svec_is_unique(&new_ports)) { + VLOG_WARN("bridge %s: %s specified twice as bridge port", + br->name, svec_get_duplicate(&new_ports)); + svec_unique(&new_ports); + } + + ofproto_set_mgmt_id(br->ofproto, mgmt_id); + + /* Get rid of deleted ports and add new ports. */ + for (i = 0; i < br->n_ports; ) { + struct port *port = br->ports[i]; + if (!svec_contains(&new_ports, port->name)) { + port_destroy(port); + } else { + i++; + } + } + for (i = 0; i < new_ports.n; i++) { + const char *name = new_ports.names[i]; + if (!svec_contains(&old_ports, name)) { + port_create(br, name); + } + } + svec_destroy(&old_ports); + svec_destroy(&new_ports); + + /* Reconfigure all ports. */ + for (i = 0; i < br->n_ports; i++) { + port_reconfigure(br->ports[i]); + } + + /* Check and delete duplicate interfaces. */ + svec_init(&ifaces); + for (i = 0; i < br->n_ports; ) { + struct port *port = br->ports[i]; + for (j = 0; j < port->n_ifaces; ) { + struct iface *iface = port->ifaces[j]; + if (svec_contains(&ifaces, iface->name)) { + VLOG_ERR("bridge %s: %s interface is on multiple ports, " + "removing from %s", + br->name, iface->name, port->name); + iface_destroy(iface); + } else { + svec_add(&ifaces, iface->name); + svec_sort(&ifaces); + j++; + } + } + if (!port->n_ifaces) { + VLOG_ERR("%s port has no interfaces, dropping", port->name); + port_destroy(port); + } else { + i++; + } + } + svec_destroy(&ifaces); + + /* Delete all flows if we're switching from connected to standalone or vice + * versa. (XXX Should we delete all flows if we are switching from one + * controller to another?) */ + + /* Configure OpenFlow management listeners. */ + svec_init(&listeners); + cfg_get_all_strings(&listeners, "bridge.%s.openflow.listeners", br->name); + if (!listeners.n) { + svec_add_nocopy(&listeners, xasprintf("punix:%s/%s.mgmt", + ovs_rundir, br->name)); + } else if (listeners.n == 1 && !strcmp(listeners.names[0], "none")) { + svec_clear(&listeners); + } + svec_sort_unique(&listeners); + + svec_init(&old_listeners); + ofproto_get_listeners(br->ofproto, &old_listeners); + svec_sort_unique(&old_listeners); + + if (!svec_equal(&listeners, &old_listeners)) { + ofproto_set_listeners(br->ofproto, &listeners); + } + svec_destroy(&listeners); + svec_destroy(&old_listeners); + + /* Configure OpenFlow controller connection snooping. */ + svec_init(&snoops); + cfg_get_all_strings(&snoops, "bridge.%s.openflow.snoops", br->name); + if (!snoops.n) { + svec_add_nocopy(&snoops, xasprintf("punix:%s/%s.snoop", + ovs_rundir, br->name)); + } else if (snoops.n == 1 && !strcmp(snoops.names[0], "none")) { + svec_clear(&snoops); + } + svec_sort_unique(&snoops); + + svec_init(&old_snoops); + ofproto_get_snoops(br->ofproto, &old_snoops); + svec_sort_unique(&old_snoops); + + if (!svec_equal(&snoops, &old_snoops)) { + ofproto_set_snoops(br->ofproto, &snoops); + } + svec_destroy(&snoops); + svec_destroy(&old_snoops); + + mirror_reconfigure(br); +} + +static void +bridge_reconfigure_controller(struct bridge *br) +{ + char *pfx = xasprintf("bridge.%s.controller", br->name); + const char *controller; + + controller = bridge_get_controller(br); + if ((br->controller != NULL) != (controller != NULL)) { + ofproto_flush_flows(br->ofproto); + } + free(br->controller); + br->controller = controller ? xstrdup(controller) : NULL; + + if (controller) { + const char *fail_mode; + int max_backoff, probe; + int rate_limit, burst_limit; + + if (!strcmp(controller, "discover")) { + ofproto_set_discovery(br->ofproto, true, + cfg_get_string(0, "%s.accept-regex", pfx), + cfg_get_bool(0, "%s.update-resolv.conf", + pfx)); + } else { + struct netdev *netdev; + bool in_band; + int error; + + in_band = (!cfg_is_valid(CFG_BOOL | CFG_REQUIRED, + "%s.in-band", pfx) + || cfg_get_bool(0, "%s.in-band", pfx)); + ofproto_set_discovery(br->ofproto, false, NULL, NULL); + ofproto_set_in_band(br->ofproto, in_band); + + error = netdev_open(br->name, NETDEV_ETH_TYPE_NONE, &netdev); + if (!error) { + if (cfg_is_valid(CFG_IP | CFG_REQUIRED, "%s.ip", pfx)) { + struct in_addr ip, mask, gateway; + ip.s_addr = cfg_get_ip(0, "%s.ip", pfx); + mask.s_addr = cfg_get_ip(0, "%s.netmask", pfx); + gateway.s_addr = cfg_get_ip(0, "%s.gateway", pfx); + + netdev_turn_flags_on(netdev, NETDEV_UP, true); + if (!mask.s_addr) { + mask.s_addr = guess_netmask(ip.s_addr); + } + if (!netdev_set_in4(netdev, ip, mask)) { + VLOG_INFO("bridge %s: configured IP address "IP_FMT", " + "netmask "IP_FMT, + br->name, IP_ARGS(&ip.s_addr), + IP_ARGS(&mask.s_addr)); + } + + if (gateway.s_addr) { + if (!netdev_add_router(gateway)) { + VLOG_INFO("bridge %s: configured gateway "IP_FMT, + br->name, IP_ARGS(&gateway.s_addr)); + } + } + } + netdev_close(netdev); + } + } + + fail_mode = cfg_get_string(0, "%s.fail-mode", pfx); + if (!fail_mode) { + fail_mode = cfg_get_string(0, "mgmt.fail-mode"); + } + ofproto_set_failure(br->ofproto, + (!fail_mode + || !strcmp(fail_mode, "standalone") + || !strcmp(fail_mode, "open"))); + + probe = cfg_get_int(0, "%s.inactivity-probe", pfx); + ofproto_set_probe_interval(br->ofproto, + probe ? probe : cfg_get_int(0, "mgmt.inactivity-probe")); + + max_backoff = cfg_get_int(0, "%s.max-backoff", pfx); + if (!max_backoff) { + max_backoff = cfg_get_int(0, "mgmt.max-backoff"); + if (!max_backoff) { + max_backoff = 15; + } + } + ofproto_set_max_backoff(br->ofproto, max_backoff); + + rate_limit = cfg_get_int(0, "%s.rate-limit", pfx); + if (!rate_limit) { + rate_limit = cfg_get_int(0, "mgmt.rate-limit"); + } + burst_limit = cfg_get_int(0, "%s.burst-limit", pfx); + if (!burst_limit) { + burst_limit = cfg_get_int(0, "mgmt.burst-limit"); + } + ofproto_set_rate_limit(br->ofproto, rate_limit, burst_limit); + + ofproto_set_stp(br->ofproto, cfg_get_bool(0, "%s.stp", pfx)); + + if (cfg_has("%s.commands.acl", pfx)) { + struct svec command_acls; + char *command_acl; + + svec_init(&command_acls); + cfg_get_all_strings(&command_acls, "%s.commands.acl", pfx); + command_acl = svec_join(&command_acls, ",", ""); + + ofproto_set_remote_execution(br->ofproto, command_acl, + cfg_get_string(0, "%s.commands.dir", + pfx)); + + svec_destroy(&command_acls); + free(command_acl); + } else { + ofproto_set_remote_execution(br->ofproto, NULL, NULL); + } + } else { + union ofp_action action; + flow_t flow; + + /* Set up a flow that matches every packet and directs them to + * OFPP_NORMAL (which goes to us). */ + memset(&action, 0, sizeof action); + action.type = htons(OFPAT_OUTPUT); + action.output.len = htons(sizeof action); + action.output.port = htons(OFPP_NORMAL); + memset(&flow, 0, sizeof flow); + ofproto_add_flow(br->ofproto, &flow, OFPFW_ALL, 0, + &action, 1, 0); + + ofproto_set_in_band(br->ofproto, false); + ofproto_set_max_backoff(br->ofproto, 1); + ofproto_set_probe_interval(br->ofproto, 5); + ofproto_set_failure(br->ofproto, false); + ofproto_set_stp(br->ofproto, false); + } + free(pfx); + + ofproto_set_controller(br->ofproto, br->controller); +} + +static void +bridge_get_all_ifaces(const struct bridge *br, struct svec *ifaces) +{ + size_t i, j; + + svec_init(ifaces); + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + svec_add(ifaces, iface->name); + } + } + svec_sort(ifaces); + assert(svec_is_unique(ifaces)); +} + +/* For robustness, in case the administrator moves around datapath ports behind + * our back, we re-check all the datapath port numbers here. + * + * This function will set the 'dp_ifidx' members of interfaces that have + * disappeared to -1, so only call this function from a context where those + * 'struct iface's will be removed from the bridge. Otherwise, the -1 + * 'dp_ifidx'es will cause trouble later when we try to send them to the + * datapath, which doesn't support UINT16_MAX+1 ports. */ +static void +bridge_fetch_dp_ifaces(struct bridge *br) +{ + struct odp_port *dpif_ports; + size_t n_dpif_ports; + size_t i, j; + + /* Reset all interface numbers. */ + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + iface->dp_ifidx = -1; + } + } + port_array_clear(&br->ifaces); + + dpif_port_list(&br->dpif, &dpif_ports, &n_dpif_ports); + for (i = 0; i < n_dpif_ports; i++) { + struct odp_port *p = &dpif_ports[i]; + struct iface *iface = iface_lookup(br, p->devname); + if (iface) { + if (iface->dp_ifidx >= 0) { + VLOG_WARN("dp%u reported interface %s twice", + dpif_id(&br->dpif), p->devname); + } else if (iface_from_dp_ifidx(br, p->port)) { + VLOG_WARN("dp%u reported interface %"PRIu16" twice", + dpif_id(&br->dpif), p->port); + } else { + port_array_set(&br->ifaces, p->port, iface); + iface->dp_ifidx = p->port; + } + } + } + free(dpif_ports); +} + +/* Bridge packet processing functions. */ + +static struct bond_entry * +lookup_bond_entry(const struct port *port, const uint8_t mac[ETH_ADDR_LEN]) +{ + size_t h = hash_bytes(mac, ETH_ADDR_LEN, 0); + return &port->bond_hash[h & BOND_MASK]; +} + +static int +bond_choose_iface(const struct port *port) +{ + size_t i; + for (i = 0; i < port->n_ifaces; i++) { + if (port->ifaces[i]->enabled) { + return i; + } + } + return -1; +} + +static bool +choose_output_iface(const struct port *port, const flow_t *flow, + uint16_t *dp_ifidx, tag_type *tags) +{ + struct iface *iface; + + assert(port->n_ifaces); + if (port->n_ifaces == 1) { + iface = port->ifaces[0]; + } else { + struct bond_entry *e = lookup_bond_entry(port, flow->dl_src); + if (e->iface_idx < 0 || e->iface_idx >= port->n_ifaces + || !port->ifaces[e->iface_idx]->enabled) { + /* XXX select interface properly. The current interface selection + * is only good for testing the rebalancing code. */ + e->iface_idx = bond_choose_iface(port); + if (e->iface_idx < 0) { + *tags |= port->no_ifaces_tag; + return false; + } + e->iface_tag = tag_create_random(); + } + *tags |= e->iface_tag; + iface = port->ifaces[e->iface_idx]; + } + *dp_ifidx = iface->dp_ifidx; + *tags |= iface->tag; /* Currently only used for bonding. */ + return true; +} + +static void +bond_link_status_update(struct iface *iface, bool carrier) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); + struct port *port = iface->port; + + if ((carrier == iface->enabled) == (iface->delay_expires == LLONG_MAX)) { + /* Nothing to do. */ + return; + } + VLOG_INFO_RL(&rl, "interface %s: carrier %s", + iface->name, carrier ? "detected" : "dropped"); + if (carrier == iface->enabled) { + iface->delay_expires = LLONG_MAX; + VLOG_INFO_RL(&rl, "interface %s: will not be %s", + iface->name, carrier ? "disabled" : "enabled"); + } else { + int delay = carrier ? port->updelay : port->downdelay; + iface->delay_expires = time_msec() + delay; + if (delay) { + VLOG_INFO_RL(&rl, + "interface %s: will be %s if it stays %s for %d ms", + iface->name, + carrier ? "enabled" : "disabled", + carrier ? "up" : "down", + delay); + } + } +} + +static void +bond_choose_active_iface(struct port *port) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); + + port->active_iface = bond_choose_iface(port); + port->active_iface_tag = tag_create_random(); + if (port->active_iface >= 0) { + VLOG_INFO_RL(&rl, "port %s: active interface is now %s", + port->name, port->ifaces[port->active_iface]->name); + } else { + VLOG_WARN_RL(&rl, "port %s: all ports disabled, no active interface", + port->name); + } +} + +static void +bond_run(struct bridge *br) +{ + size_t i, j; + + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (port->n_ifaces < 2) { + continue; + } + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + if (time_msec() >= iface->delay_expires) { + iface->delay_expires = LLONG_MAX; + iface->enabled = !iface->enabled; + VLOG_WARN("interface %s: %s", + iface->name, + iface->enabled ? "enabled" : "disabled"); + if (!iface->enabled) { + ofproto_revalidate(br->ofproto, iface->tag); + if (iface->port_ifidx == port->active_iface) { + ofproto_revalidate(br->ofproto, + port->active_iface_tag); + bond_choose_active_iface(port); + } + } else { + if (port->active_iface < 0) { + ofproto_revalidate(br->ofproto, port->no_ifaces_tag); + bond_choose_active_iface(port); + } + iface->tag = tag_create_random(); + } + } + } + } +} + +static void +bond_wait(struct bridge *br) +{ + size_t i, j; + + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (port->n_ifaces < 2) { + continue; + } + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + if (iface->delay_expires != LLONG_MAX) { + poll_timer_wait(iface->delay_expires - time_msec()); + } + } + } +} + +static bool +set_dst(struct dst *p, const flow_t *flow, + const struct port *in_port, const struct port *out_port, + tag_type *tags) +{ + /* STP handling. + * + * XXX This uses too many tags: any broadcast flow will get one tag per + * destination port, and thus a broadcast on a switch of any size is likely + * to have all tag bits set. We should figure out a way to be smarter. + * + * This is OK when STP is disabled, because stp_state_tag is 0 then. */ + *tags |= out_port->stp_state_tag; + if (!(out_port->stp_state & (STP_DISABLED | STP_FORWARDING))) { + return false; + } + + p->vlan = (out_port->vlan >= 0 ? OFP_VLAN_NONE + : in_port->vlan >= 0 ? in_port->vlan + : ntohs(flow->dl_vlan)); + return choose_output_iface(out_port, flow, &p->dp_ifidx, tags); +} + +static void +swap_dst(struct dst *p, struct dst *q) +{ + struct dst tmp = *p; + *p = *q; + *q = tmp; +} + +/* Moves all the dsts with vlan == 'vlan' to the front of the 'n_dsts' in + * 'dsts'. (This may help performance by reducing the number of VLAN changes + * that we push to the datapath. We could in fact fully sort the array by + * vlan, but in most cases there are at most two different vlan tags so that's + * possibly overkill.) */ +static void +partition_dsts(struct dst *dsts, size_t n_dsts, int vlan) +{ + struct dst *first = dsts; + struct dst *last = dsts + n_dsts; + + while (first != last) { + /* Invariants: + * - All dsts < first have vlan == 'vlan'. + * - All dsts >= last have vlan != 'vlan'. + * - first < last. */ + while (first->vlan == vlan) { + if (++first == last) { + return; + } + } + + /* Same invariants, plus one additional: + * - first->vlan != vlan. + */ + while (last[-1].vlan != vlan) { + if (--last == first) { + return; + } + } + + /* Same invariants, plus one additional: + * - last[-1].vlan == vlan.*/ + swap_dst(first++, --last); + } +} + +static int +mirror_mask_ffs(mirror_mask_t mask) +{ + BUILD_ASSERT_DECL(sizeof(unsigned int) >= sizeof(mask)); + return ffs(mask); +} + +static bool +dst_is_duplicate(const struct dst *dsts, size_t n_dsts, + const struct dst *test) +{ + size_t i; + for (i = 0; i < n_dsts; i++) { + if (dsts[i].vlan == test->vlan && dsts[i].dp_ifidx == test->dp_ifidx) { + return true; + } + } + return false; +} + +static bool +port_trunks_vlan(const struct port *port, uint16_t vlan) +{ + return port->vlan < 0 && bitmap_is_set(port->trunks, vlan); +} + +static bool +port_includes_vlan(const struct port *port, uint16_t vlan) +{ + return vlan == port->vlan || port_trunks_vlan(port, vlan); +} + +static size_t +compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan, + const struct port *in_port, const struct port *out_port, + struct dst dsts[], tag_type *tags) +{ + mirror_mask_t mirrors = in_port->src_mirrors; + struct dst *dst = dsts; + size_t i; + + *tags |= in_port->stp_state_tag; + if (out_port == FLOOD_PORT) { + /* XXX use ODP_FLOOD if no vlans or bonding. */ + /* XXX even better, define each VLAN as a datapath port group */ + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (port != in_port && port_includes_vlan(port, vlan) + && !port->is_mirror_output_port + && set_dst(dst, flow, in_port, port, tags)) { + mirrors |= port->dst_mirrors; + dst++; + } + } + } else if (out_port && set_dst(dst, flow, in_port, out_port, tags)) { + mirrors |= out_port->dst_mirrors; + dst++; + } + + while (mirrors) { + struct mirror *m = br->mirrors[mirror_mask_ffs(mirrors) - 1]; + if (!m->n_vlans || vlan_is_mirrored(m, vlan)) { + if (m->out_port) { + if (set_dst(dst, flow, in_port, m->out_port, tags) + && !dst_is_duplicate(dsts, dst - dsts, dst)) { + dst++; + } + } else { + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (port_includes_vlan(port, m->out_vlan) + && set_dst(dst, flow, in_port, port, tags) + && !dst_is_duplicate(dsts, dst - dsts, dst)) + { + if (port->vlan < 0) { + dst->vlan = m->out_vlan; + } + if (dst->dp_ifidx == flow->in_port + && dst->vlan == vlan) { + /* Don't send out input port on same VLAN. */ + continue; + } + dst++; + } + } + } + } + mirrors &= mirrors - 1; + } + + partition_dsts(dsts, dst - dsts, ntohs(flow->dl_vlan)); + return dst - dsts; +} + +static void UNUSED +print_dsts(const struct dst *dsts, size_t n) +{ + for (; n--; dsts++) { + printf(">p%"PRIu16, dsts->dp_ifidx); + if (dsts->vlan != OFP_VLAN_NONE) { + printf("v%"PRIu16, dsts->vlan); + } + } +} + +static void +compose_actions(struct bridge *br, const flow_t *flow, uint16_t vlan, + const struct port *in_port, const struct port *out_port, + tag_type *tags, struct odp_actions *actions) +{ + struct dst dsts[DP_MAX_PORTS * (MAX_MIRRORS + 1)]; + size_t n_dsts; + const struct dst *p; + uint16_t cur_vlan; + + n_dsts = compose_dsts(br, flow, vlan, in_port, out_port, dsts, tags); + + cur_vlan = ntohs(flow->dl_vlan); + for (p = dsts; p < &dsts[n_dsts]; p++) { + union odp_action *a; + if (p->vlan != cur_vlan) { + if (p->vlan == OFP_VLAN_NONE) { + odp_actions_add(actions, ODPAT_STRIP_VLAN); + } else { + a = odp_actions_add(actions, ODPAT_SET_VLAN_VID); + a->vlan_vid.vlan_vid = htons(p->vlan); + } + cur_vlan = p->vlan; + } + a = odp_actions_add(actions, ODPAT_OUTPUT); + a->output.port = p->dp_ifidx; + } +} + +static bool +is_bcast_arp_reply(const flow_t *flow, const struct ofpbuf *packet) +{ + struct arp_eth_header *arp = (struct arp_eth_header *) packet->data; + return (flow->dl_type == htons(ETH_TYPE_ARP) + && eth_addr_is_broadcast(flow->dl_dst) + && packet->size >= sizeof(struct arp_eth_header) + && arp->ar_op == ARP_OP_REQUEST); +} + +/* If the composed actions may be applied to any packet in the given 'flow', + * returns true. Otherwise, the actions should only be applied to 'packet', or + * not at all, if 'packet' was NULL. */ +static bool +process_flow(struct bridge *br, const flow_t *flow, + const struct ofpbuf *packet, struct odp_actions *actions, + tag_type *tags) +{ + struct iface *in_iface; + struct port *in_port; + struct port *out_port = NULL; /* By default, drop the packet/flow. */ + int vlan; + + /* Find the interface and port structure for the received packet. */ + in_iface = iface_from_dp_ifidx(br, flow->in_port); + if (!in_iface) { + /* No interface? Something fishy... */ + if (packet != NULL) { + /* Odd. A few possible reasons here: + * + * - We deleted an interface but there are still a few packets + * queued up from it. + * + * - Someone externally added an interface (e.g. with "ovs-dpctl + * add-if") that we don't know about. + * + * - Packet arrived on the local port but the local port is not + * one of our bridge ports. + */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + + VLOG_WARN_RL(&rl, "bridge %s: received packet on unknown " + "interface %"PRIu16, br->name, flow->in_port); + } + + /* Return without adding any actions, to drop packets on this flow. */ + return true; + } + in_port = in_iface->port; + + /* Figure out what VLAN this packet belongs to. + * + * Note that dl_vlan of 0 and of OFP_VLAN_NONE both mean that the packet + * belongs to VLAN 0, so we should treat both cases identically. (In the + * former case, the packet has an 802.1Q header that specifies VLAN 0, + * presumably to allow a priority to be specified. In the latter case, the + * packet does not have any 802.1Q header.) */ + vlan = ntohs(flow->dl_vlan); + if (vlan == OFP_VLAN_NONE) { + vlan = 0; + } + if (in_port->vlan >= 0) { + if (vlan) { + /* XXX support double tagging? */ + if (packet != NULL) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %"PRIu16" tagged " + "packet received on port %s configured with " + "implicit VLAN %"PRIu16, + br->name, ntohs(flow->dl_vlan), + in_port->name, in_port->vlan); + } + goto done; + } + vlan = in_port->vlan; + } else { + if (!port_includes_vlan(in_port, vlan)) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %d tagged " + "packet received on port %s not configured for " + "trunking VLAN %d", + br->name, vlan, in_port->name, vlan); + goto done; + } + } + + /* Drop frames for ports that STP wants entirely killed (both for + * forwarding and for learning). Later, after we do learning, we'll drop + * the frames that STP wants to do learning but not forwarding on. */ + if (in_port->stp_state & (STP_LISTENING | STP_BLOCKING)) { + goto done; + } + + /* Drop frames for reserved multicast addresses. */ + if (eth_addr_is_reserved(flow->dl_dst)) { + goto done; + } + + /* Drop frames on ports reserved for mirroring. */ + if (in_port->is_mirror_output_port) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, "bridge %s: dropping packet received on port %s, " + "which is reserved exclusively for mirroring", + br->name, in_port->name); + goto done; + } + + /* Drop multicast and broadcast packets on inactive bonded interfaces, to + * avoid receiving duplicates. */ + if (in_port->n_ifaces > 1 && eth_addr_is_multicast(flow->dl_dst)) { + *tags |= in_port->active_iface_tag; + if (in_port->active_iface != in_iface->port_ifidx) { + goto done; + } + } + + /* MAC learning. */ + out_port = FLOOD_PORT; + if (br->ml) { + int out_port_idx; + bool may_learn; + + if (!packet) { + /* Don't try to learn from revalidation. */ + may_learn = false; + } else if (in_port->n_ifaces > 1) { + /* If the packet arrived on a bonded port, don't learn from it + * unless we haven't learned any port at all for that address + * (because we probably sent the packet on one bonded interface and + * got it back on the other). Broadcast ARP replies are an + * exception to this rule: the host has moved to another switch. */ + int src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan); + may_learn = (src_idx < 0 + || src_idx == in_port->port_idx + || is_bcast_arp_reply(flow, packet)); + } else { + may_learn = true; + } + + /* Learn source MAC. */ + if (may_learn) { + tag_type rev_tag = mac_learning_learn(br->ml, flow->dl_src, + vlan, in_port->port_idx); + if (rev_tag) { + /* The log messages here could actually be useful in debugging, + * so keep the rate limit relatively high. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, + 300); + VLOG_DBG_RL(&rl, "bridge %s: learned that "ETH_ADDR_FMT" is " + "on port %s in VLAN %d", + br->name, ETH_ADDR_ARGS(flow->dl_src), + in_port->name, vlan); + ofproto_revalidate(br->ofproto, rev_tag); + } + } + + /* Determine output port. */ + out_port_idx = mac_learning_lookup_tag(br->ml, flow->dl_dst, vlan, + tags); + if (out_port_idx >= 0 && out_port_idx < br->n_ports) { + out_port = br->ports[out_port_idx]; + } + } + + /* Don't send packets out their input ports. Don't forward frames that STP + * wants us to discard. */ + if (in_port == out_port || in_port->stp_state == STP_LEARNING) { + out_port = NULL; + } + +done: + compose_actions(br, flow, vlan, in_port, out_port, tags, actions); + + /* + * We send out only a single packet, instead of setting up a flow, if the + * packet is an ARP directed to broadcast that arrived on a bonded + * interface. In such a situation ARP requests and replies must be handled + * differently, but OpenFlow unfortunately can't distinguish them. + */ + return (in_port->n_ifaces < 2 + || flow->dl_type != htons(ETH_TYPE_ARP) + || !eth_addr_is_broadcast(flow->dl_dst)); +} + +/* Careful: 'opp' is in host byte order and opp->port_no is an OFP port + * number. */ +static void +bridge_port_changed_ofhook_cb(enum ofp_port_reason reason, + const struct ofp_phy_port *opp, + void *br_) +{ + struct bridge *br = br_; + struct iface *iface; + struct port *port; + + iface = iface_from_dp_ifidx(br, ofp_port_to_odp_port(opp->port_no)); + if (!iface) { + return; + } + port = iface->port; + + if (reason == OFPPR_DELETE) { + VLOG_WARN("bridge %s: interface %s deleted unexpectedly", + br->name, iface->name); + iface_destroy(iface); + if (!port->n_ifaces) { + VLOG_WARN("bridge %s: port %s has no interfaces, dropping", + br->name, port->name); + port_destroy(port); + } + + bridge_flush(br); + } else { + memcpy(iface->mac, opp->hw_addr, ETH_ADDR_LEN); + if (port->n_ifaces > 1) { + bool up = !(opp->state & OFPPS_LINK_DOWN); + bond_link_status_update(iface, up); + port_update_bond_compat(port); + } + } +} + +static bool +bridge_normal_ofhook_cb(const flow_t *flow, const struct ofpbuf *packet, + struct odp_actions *actions, tag_type *tags, void *br_) +{ + struct bridge *br = br_; + +#if 0 + if (flow->dl_type == htons(OFP_DL_TYPE_NOT_ETH_TYPE) + && eth_addr_equals(flow->dl_dst, stp_eth_addr)) { + brstp_receive(br, flow, payload); + return true; + } +#endif + + COVERAGE_INC(bridge_process_flow); + return process_flow(br, flow, packet, actions, tags); +} + +static void +bridge_account_flow_ofhook_cb(const flow_t *flow, + const union odp_action *actions, + size_t n_actions, unsigned long long int n_bytes, + void *br_) +{ + struct bridge *br = br_; + const union odp_action *a; + + if (!br->has_bonded_ports) { + return; + } + + for (a = actions; a < &actions[n_actions]; a++) { + if (a->type == ODPAT_OUTPUT) { + struct port *port = port_from_dp_ifidx(br, a->output.port); + if (port && port->n_ifaces >= 2) { + struct bond_entry *e = lookup_bond_entry(port, flow->dl_src); + e->tx_bytes += n_bytes; + } + } + } +} + +static void +bridge_account_checkpoint_ofhook_cb(void *br_) +{ + struct bridge *br = br_; + size_t i; + + if (!br->has_bonded_ports) { + return; + } + + /* The current ofproto implementation calls this callback at least once a + * second, so this timer implementation is sufficient. */ + if (time_msec() < br->bond_next_rebalance) { + return; + } + br->bond_next_rebalance = time_msec() + 10000; + + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (port->n_ifaces > 1) { + bond_rebalance_port(port); + } + } +} + +static struct ofhooks bridge_ofhooks = { + bridge_port_changed_ofhook_cb, + bridge_normal_ofhook_cb, + bridge_account_flow_ofhook_cb, + bridge_account_checkpoint_ofhook_cb, +}; + +/* Statistics for a single interface on a bonded port, used for load-based + * bond rebalancing. */ +struct slave_balance { + struct iface *iface; /* The interface. */ + uint64_t tx_bytes; /* Sum of hashes[*]->tx_bytes. */ + + /* All the "bond_entry"s that are assigned to this interface, in order of + * increasing tx_bytes. */ + struct bond_entry **hashes; + size_t n_hashes; +}; + +/* Sorts pointers to pointers to bond_entries in ascending order by the + * interface to which they are assigned, and within a single interface in + * ascending order of bytes transmitted. */ +static int +compare_bond_entries(const void *a_, const void *b_) +{ + const struct bond_entry *const *ap = a_; + const struct bond_entry *const *bp = b_; + const struct bond_entry *a = *ap; + const struct bond_entry *b = *bp; + if (a->iface_idx != b->iface_idx) { + return a->iface_idx > b->iface_idx ? 1 : -1; + } else if (a->tx_bytes != b->tx_bytes) { + return a->tx_bytes > b->tx_bytes ? 1 : -1; + } else { + return 0; + } +} + +/* Sorts slave_balances so that enabled ports come first, and otherwise in + * *descending* order by number of bytes transmitted. */ +static int +compare_slave_balance(const void *a_, const void *b_) +{ + const struct slave_balance *a = a_; + const struct slave_balance *b = b_; + if (a->iface->enabled != b->iface->enabled) { + return a->iface->enabled ? -1 : 1; + } else if (a->tx_bytes != b->tx_bytes) { + return a->tx_bytes > b->tx_bytes ? -1 : 1; + } else { + return 0; + } +} + +static void +swap_bals(struct slave_balance *a, struct slave_balance *b) +{ + struct slave_balance tmp = *a; + *a = *b; + *b = tmp; +} + +/* Restores the 'n_bals' slave_balance structures in 'bals' to sorted order + * given that 'p' (and only 'p') might be in the wrong location. + * + * This function invalidates 'p', since it might now be in a different memory + * location. */ +static void +resort_bals(struct slave_balance *p, + struct slave_balance bals[], size_t n_bals) +{ + if (n_bals > 1) { + for (; p > bals && p->tx_bytes > p[-1].tx_bytes; p--) { + swap_bals(p, p - 1); + } + for (; p < &bals[n_bals - 1] && p->tx_bytes < p[1].tx_bytes; p++) { + swap_bals(p, p + 1); + } + } +} + +static void +log_bals(const struct slave_balance *bals, size_t n_bals, struct port *port) +{ + if (VLOG_IS_DBG_ENABLED()) { + struct ds ds = DS_EMPTY_INITIALIZER; + const struct slave_balance *b; + + for (b = bals; b < bals + n_bals; b++) { + size_t i; + + if (b > bals) { + ds_put_char(&ds, ','); + } + ds_put_format(&ds, " %s %"PRIu64"kB", + b->iface->name, b->tx_bytes / 1024); + + if (!b->iface->enabled) { + ds_put_cstr(&ds, " (disabled)"); + } + if (b->n_hashes > 0) { + ds_put_cstr(&ds, " ("); + for (i = 0; i < b->n_hashes; i++) { + const struct bond_entry *e = b->hashes[i]; + if (i > 0) { + ds_put_cstr(&ds, " + "); + } + ds_put_format(&ds, "h%td: %"PRIu64"kB", + e - port->bond_hash, e->tx_bytes / 1024); + } + ds_put_cstr(&ds, ")"); + } + } + VLOG_DBG("bond %s:%s", port->name, ds_cstr(&ds)); + ds_destroy(&ds); + } +} + +/* Shifts 'hash' from 'from' to 'to' within 'port'. */ +static void +bond_shift_load(struct slave_balance *from, struct slave_balance *to, + struct bond_entry *hash) +{ + struct port *port = from->iface->port; + uint64_t delta = hash->tx_bytes; + + VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) " + "from %s to %s (now carrying %"PRIu64"kB and " + "%"PRIu64"kB load, respectively)", + port->name, delta / 1024, hash - port->bond_hash, + from->iface->name, to->iface->name, + (from->tx_bytes - delta) / 1024, + (to->tx_bytes + delta) / 1024); + + /* Delete element from from->hashes. + * + * We don't bother to add the element to to->hashes because not only would + * it require more work, the only purpose it would be to allow that hash to + * be migrated to another slave in this rebalancing run, and there is no + * point in doing that. */ + if (from->hashes[0] == hash) { + from->hashes++; + } else { + int i = hash - from->hashes[0]; + memmove(from->hashes + i, from->hashes + i + 1, + (from->n_hashes - (i + 1)) * sizeof *from->hashes); + } + from->n_hashes--; + + /* Shift load away from 'from' to 'to'. */ + from->tx_bytes -= delta; + to->tx_bytes += delta; + + /* Arrange for flows to be revalidated. */ + ofproto_revalidate(port->bridge->ofproto, hash->iface_tag); + hash->iface_idx = to->iface->port_ifidx; + hash->iface_tag = tag_create_random(); + +} + +static void +bond_rebalance_port(struct port *port) +{ + struct slave_balance bals[DP_MAX_PORTS]; + size_t n_bals; + struct bond_entry *hashes[BOND_MASK + 1]; + struct slave_balance *b, *from, *to; + struct bond_entry *e; + size_t i; + + /* Sets up 'bals' to describe each of the port's interfaces, sorted in + * descending order of tx_bytes, so that bals[0] represents the most + * heavily loaded slave and bals[n_bals - 1] represents the least heavily + * loaded slave. + * + * The code is a bit tricky: to avoid dynamically allocating a 'hashes' + * array for each slave_balance structure, we sort our local array of + * hashes in order by slave, so that all of the hashes for a given slave + * become contiguous in memory, and then we point each 'hashes' members of + * a slave_balance structure to the start of a contiguous group. */ + n_bals = port->n_ifaces; + for (b = bals; b < &bals[n_bals]; b++) { + b->iface = port->ifaces[b - bals]; + b->tx_bytes = 0; + b->hashes = NULL; + b->n_hashes = 0; + } + for (i = 0; i <= BOND_MASK; i++) { + hashes[i] = &port->bond_hash[i]; + } + qsort(hashes, BOND_MASK + 1, sizeof *hashes, compare_bond_entries); + for (i = 0; i <= BOND_MASK; i++) { + e = hashes[i]; + if (e->iface_idx >= 0 && e->iface_idx < port->n_ifaces) { + b = &bals[e->iface_idx]; + b->tx_bytes += e->tx_bytes; + if (!b->hashes) { + b->hashes = &hashes[i]; + } + b->n_hashes++; + } + } + qsort(bals, n_bals, sizeof *bals, compare_slave_balance); + log_bals(bals, n_bals, port); + + /* Discard slaves that aren't enabled (which were sorted to the back of the + * array earlier). */ + while (!bals[n_bals - 1].iface->enabled) { + n_bals--; + if (!n_bals) { + return; + } + } + + /* Shift load from the most-loaded slaves to the least-loaded slaves. */ + to = &bals[n_bals - 1]; + for (from = bals; from < to; ) { + uint64_t overload = from->tx_bytes - to->tx_bytes; + if (overload < to->tx_bytes >> 5 || overload < 100000) { + /* The extra load on 'from' (and all less-loaded slaves), compared + * to that of 'to' (the least-loaded slave), is less than ~3%, or + * it is less than ~1Mbps. No point in rebalancing. */ + break; + } else if (from->n_hashes == 1) { + /* 'from' only carries a single MAC hash, so we can't shift any + * load away from it, even though we want to. */ + from++; + } else { + /* 'from' is carrying significantly more load than 'to', and that + * load is split across at least two different hashes. Pick a hash + * to migrate to 'to' (the least-loaded slave), given that doing so + * must not cause 'to''s load to exceed 'from''s load. + * + * The sort order we use means that we prefer to shift away the + * smallest hashes instead of the biggest ones. There is little + * reason behind this decision; we could use the opposite sort + * order to shift away big hashes ahead of small ones. */ + size_t i; + + for (i = 0; i < from->n_hashes; i++) { + uint64_t delta = from->hashes[i]->tx_bytes; + if (to->tx_bytes + delta < from->tx_bytes - delta) { + break; + } + } + if (i < from->n_hashes) { + bond_shift_load(from, to, from->hashes[i]); + + /* Re-sort 'bals'. Note that this may make 'from' and 'to' + * point to different slave_balance structures. It is only + * valid to do these two operations in a row at all because we + * know that 'from' will not move past 'to' and vice versa. */ + resort_bals(from, bals, n_bals); + resort_bals(to, bals, n_bals); + } else { + from++; + } + } + } + + /* Implement exponentially weighted moving average. A weight of 1/2 causes + * historical data to decay to <1% in 7 rebalancing runs. */ + for (e = &port->bond_hash[0]; e <= &port->bond_hash[BOND_MASK]; e++) { + e->tx_bytes /= 2; + } +} + +/* Port functions. */ + +static void +port_create(struct bridge *br, const char *name) +{ + struct port *port; + + port = xcalloc(1, sizeof *port); + port->bridge = br; + port->port_idx = br->n_ports; + port->vlan = -1; + port->trunks = NULL; + port->name = xstrdup(name); + port->active_iface = -1; + port->stp_state = STP_DISABLED; + port->stp_state_tag = 0; + + if (br->n_ports >= br->allocated_ports) { + br->ports = x2nrealloc(br->ports, &br->allocated_ports, + sizeof *br->ports); + } + br->ports[br->n_ports++] = port; + + VLOG_INFO("created port %s on bridge %s", port->name, br->name); + bridge_flush(br); +} + +static void +port_reconfigure(struct port *port) +{ + bool bonded = cfg_has_section("bonding.%s", port->name); + struct svec old_ifaces, new_ifaces; + unsigned long *trunks; + int vlan; + size_t i; + + /* Collect old and new interfaces. */ + svec_init(&old_ifaces); + svec_init(&new_ifaces); + for (i = 0; i < port->n_ifaces; i++) { + svec_add(&old_ifaces, port->ifaces[i]->name); + } + svec_sort(&old_ifaces); + if (bonded) { + cfg_get_all_keys(&new_ifaces, "bonding.%s.slave", port->name); + if (!new_ifaces.n) { + VLOG_ERR("port %s: no interfaces specified for bonded port", + port->name); + } else if (new_ifaces.n == 1) { + VLOG_WARN("port %s: only 1 interface specified for bonded port", + port->name); + } + + port->updelay = cfg_get_int(0, "bonding.%s.updelay", port->name); + if (port->updelay < 0) { + port->updelay = 0; + } + port->downdelay = cfg_get_int(0, "bonding.%s.downdelay", port->name); + if (port->downdelay < 0) { + port->downdelay = 0; + } + } else { + svec_init(&new_ifaces); + svec_add(&new_ifaces, port->name); + } + + /* Get rid of deleted interfaces and add new interfaces. */ + for (i = 0; i < port->n_ifaces; i++) { + struct iface *iface = port->ifaces[i]; + if (!svec_contains(&new_ifaces, iface->name)) { + iface_destroy(iface); + } else { + i++; + } + } + for (i = 0; i < new_ifaces.n; i++) { + const char *name = new_ifaces.names[i]; + if (!svec_contains(&old_ifaces, name)) { + iface_create(port, name); + } + } + + /* Get VLAN tag. */ + vlan = -1; + if (cfg_has("vlan.%s.tag", port->name)) { + if (!bonded) { + vlan = cfg_get_vlan(0, "vlan.%s.tag", port->name); + if (vlan >= 0 && vlan <= 4095) { + VLOG_DBG("port %s: assigning VLAN tag %d", port->name, vlan); + } + } else { + /* It's possible that bonded, VLAN-tagged ports make sense. Maybe + * they even work as-is. But they have not been tested. */ + VLOG_WARN("port %s: VLAN tags not supported on bonded ports", + port->name); + } + } + if (port->vlan != vlan) { + port->vlan = vlan; + bridge_flush(port->bridge); + } + + /* Get trunked VLANs. */ + trunks = NULL; + if (vlan < 0) { + size_t n_trunks, n_errors; + size_t i; + + trunks = bitmap_allocate(4096); + n_trunks = cfg_count("vlan.%s.trunks", port->name); + n_errors = 0; + for (i = 0; i < n_trunks; i++) { + int trunk = cfg_get_vlan(i, "vlan.%s.trunks", port->name); + if (trunk >= 0) { + bitmap_set1(trunks, trunk); + } else { + n_errors++; + } + } + if (n_errors) { + VLOG_ERR("port %s: invalid values for %zu trunk VLANs", + port->name, n_trunks); + } + if (n_errors == n_trunks) { + if (n_errors) { + VLOG_ERR("port %s: no valid trunks, trunking all VLANs", + port->name); + } + bitmap_set_multiple(trunks, 0, 4096, 1); + } + } else { + if (cfg_has("vlan.%s.trunks", port->name)) { + VLOG_ERR("ignoring vlan.%s.trunks in favor of vlan.%s.vlan", + port->name, port->name); + } + } + if (trunks == NULL + ? port->trunks != NULL + : port->trunks == NULL || !bitmap_equal(trunks, port->trunks, 4096)) { + bridge_flush(port->bridge); + } + bitmap_free(port->trunks); + port->trunks = trunks; + + svec_destroy(&old_ifaces); + svec_destroy(&new_ifaces); +} + +static void +port_destroy(struct port *port) +{ + if (port) { + struct bridge *br = port->bridge; + struct port *del; + size_t i; + + proc_net_compat_update_vlan(port->name, NULL, 0); + + for (i = 0; i < MAX_MIRRORS; i++) { + struct mirror *m = br->mirrors[i]; + if (m && m->out_port == port) { + mirror_destroy(m); + } + } + + while (port->n_ifaces > 0) { + iface_destroy(port->ifaces[port->n_ifaces - 1]); + } + + del = br->ports[port->port_idx] = br->ports[--br->n_ports]; + del->port_idx = port->port_idx; + + free(port->ifaces); + bitmap_free(port->trunks); + free(port->name); + free(port); + bridge_flush(br); + } +} + +static struct port * +port_from_dp_ifidx(const struct bridge *br, uint16_t dp_ifidx) +{ + struct iface *iface = iface_from_dp_ifidx(br, dp_ifidx); + return iface ? iface->port : NULL; +} + +static struct port * +port_lookup(const struct bridge *br, const char *name) +{ + size_t i; + + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (!strcmp(port->name, name)) { + return port; + } + } + return NULL; +} + +static void +port_update_bonding(struct port *port) +{ + if (port->n_ifaces < 2) { + /* Not a bonded port. */ + if (port->bond_hash) { + free(port->bond_hash); + port->bond_hash = NULL; + proc_net_compat_update_bond(port->name, NULL); + } + } else { + if (!port->bond_hash) { + size_t i; + + port->bond_hash = xcalloc(BOND_MASK + 1, sizeof *port->bond_hash); + for (i = 0; i <= BOND_MASK; i++) { + struct bond_entry *e = &port->bond_hash[i]; + e->iface_idx = -1; + e->tx_bytes = 0; + } + port->no_ifaces_tag = tag_create_random(); + bond_choose_active_iface(port); + } + port_update_bond_compat(port); + } +} + +static void +port_update_bond_compat(struct port *port) +{ + struct compat_bond bond; + size_t i; + + if (port->n_ifaces < 2) { + return; + } + + bond.up = false; + bond.updelay = port->updelay; + bond.downdelay = port->downdelay; + bond.n_slaves = port->n_ifaces; + bond.slaves = xmalloc(port->n_ifaces * sizeof *bond.slaves); + for (i = 0; i < port->n_ifaces; i++) { + struct iface *iface = port->ifaces[i]; + struct compat_bond_slave *slave = &bond.slaves[i]; + slave->name = iface->name; + slave->up = ((iface->enabled && iface->delay_expires == LLONG_MAX) || + (!iface->enabled && iface->delay_expires != LLONG_MAX)); + if (slave->up) { + bond.up = true; + } + memcpy(slave->mac, iface->mac, ETH_ADDR_LEN); + } + proc_net_compat_update_bond(port->name, &bond); + free(bond.slaves); +} + +static void +port_update_vlan_compat(struct port *port) +{ + struct bridge *br = port->bridge; + char *vlandev_name = NULL; + + if (port->vlan > 0) { + /* Figure out the name that the VLAN device should actually have, if it + * existed. This takes some work because the VLAN device would not + * have port->name in its name; rather, it would have the trunk port's + * name, and 'port' would be attached to a bridge that also had the + * VLAN device one of its ports. So we need to find a trunk port that + * includes port->vlan. + * + * There might be more than one candidate. This doesn't happen on + * XenServer, so if it happens we just pick the first choice in + * alphabetical order instead of creating multiple VLAN devices. */ + size_t i; + for (i = 0; i < br->n_ports; i++) { + struct port *p = br->ports[i]; + if (port_trunks_vlan(p, port->vlan) + && p->n_ifaces + && (!vlandev_name || strcmp(p->name, vlandev_name) <= 0)) + { + const uint8_t *ea = p->ifaces[0]->mac; + if (!eth_addr_is_multicast(ea) && + !eth_addr_is_reserved(ea) && + !eth_addr_is_zero(ea)) { + vlandev_name = p->name; + } + } + } + } + proc_net_compat_update_vlan(port->name, vlandev_name, port->vlan); +} + +/* Interface functions. */ + +static void +iface_create(struct port *port, const char *name) +{ + enum netdev_flags flags; + struct iface *iface; + + iface = xcalloc(1, sizeof *iface); + iface->port = port; + iface->port_ifidx = port->n_ifaces; + iface->name = xstrdup(name); + iface->dp_ifidx = -1; + iface->tag = tag_create_random(); + iface->enabled = true; + iface->delay_expires = LLONG_MAX; + + netdev_nodev_get_etheraddr(name, iface->mac); + + if (!netdev_nodev_get_flags(name, &flags)) { + iface->enabled = (flags & NETDEV_UP) != 0; + } + + if (port->n_ifaces >= port->allocated_ifaces) { + port->ifaces = x2nrealloc(port->ifaces, &port->allocated_ifaces, + sizeof *port->ifaces); + } + port->ifaces[port->n_ifaces++] = iface; + if (port->n_ifaces > 1) { + port->bridge->has_bonded_ports = true; + } + + VLOG_DBG("attached network device %s to port %s", iface->name, port->name); + + port_update_bonding(port); + bridge_flush(port->bridge); +} + +static void +iface_destroy(struct iface *iface) +{ + if (iface) { + struct port *port = iface->port; + struct bridge *br = port->bridge; + bool del_active = port->active_iface == iface->port_ifidx; + struct iface *del; + + if (iface->dp_ifidx >= 0) { + port_array_set(&br->ifaces, iface->dp_ifidx, NULL); + } + + del = port->ifaces[iface->port_ifidx] = port->ifaces[--port->n_ifaces]; + del->port_ifidx = iface->port_ifidx; + + free(iface->name); + free(iface); + + if (del_active) { + ofproto_revalidate(port->bridge->ofproto, port->active_iface_tag); + bond_choose_active_iface(port); + } + + port_update_bonding(port); + bridge_flush(port->bridge); + } +} + +static struct iface * +iface_lookup(const struct bridge *br, const char *name) +{ + size_t i, j; + + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + if (!strcmp(iface->name, name)) { + return iface; + } + } + } + return NULL; +} + +static struct iface * +iface_from_dp_ifidx(const struct bridge *br, uint16_t dp_ifidx) +{ + return port_array_get(&br->ifaces, dp_ifidx); +} + +/* Port mirroring. */ + +static void +mirror_reconfigure(struct bridge *br) +{ + struct svec old_mirrors, new_mirrors; + size_t i; + + /* Collect old and new mirrors. */ + svec_init(&old_mirrors); + svec_init(&new_mirrors); + cfg_get_subsections(&new_mirrors, "mirror.%s", br->name); + for (i = 0; i < MAX_MIRRORS; i++) { + if (br->mirrors[i]) { + svec_add(&old_mirrors, br->mirrors[i]->name); + } + } + + /* Get rid of deleted mirrors and add new mirrors. */ + svec_sort(&old_mirrors); + assert(svec_is_unique(&old_mirrors)); + svec_sort(&new_mirrors); + assert(svec_is_unique(&new_mirrors)); + for (i = 0; i < MAX_MIRRORS; i++) { + struct mirror *m = br->mirrors[i]; + if (m && !svec_contains(&new_mirrors, m->name)) { + mirror_destroy(m); + } + } + for (i = 0; i < new_mirrors.n; i++) { + const char *name = new_mirrors.names[i]; + if (!svec_contains(&old_mirrors, name)) { + mirror_create(br, name); + } + } + svec_destroy(&old_mirrors); + svec_destroy(&new_mirrors); + + /* Reconfigure all mirrors. */ + for (i = 0; i < MAX_MIRRORS; i++) { + if (br->mirrors[i]) { + mirror_reconfigure_one(br->mirrors[i]); + } + } + + /* Update port reserved status. */ + for (i = 0; i < br->n_ports; i++) { + br->ports[i]->is_mirror_output_port = false; + } + for (i = 0; i < MAX_MIRRORS; i++) { + struct mirror *m = br->mirrors[i]; + if (m && m->out_port) { + m->out_port->is_mirror_output_port = true; + } + } +} + +static void +mirror_create(struct bridge *br, const char *name) +{ + struct mirror *m; + size_t i; + + for (i = 0; ; i++) { + if (i >= MAX_MIRRORS) { + VLOG_WARN("bridge %s: maximum of %d port mirrors reached, " + "cannot create %s", br->name, MAX_MIRRORS, name); + return; + } + if (!br->mirrors[i]) { + break; + } + } + + VLOG_INFO("created port mirror %s on bridge %s", name, br->name); + bridge_flush(br); + + br->mirrors[i] = m = xcalloc(1, sizeof *m); + m->bridge = br; + m->idx = i; + m->name = xstrdup(name); + svec_init(&m->src_ports); + svec_init(&m->dst_ports); + m->vlans = NULL; + m->n_vlans = 0; + m->out_vlan = -1; + m->out_port = NULL; +} + +static void +mirror_destroy(struct mirror *m) +{ + if (m) { + struct bridge *br = m->bridge; + size_t i; + + for (i = 0; i < br->n_ports; i++) { + br->ports[i]->src_mirrors &= ~(MIRROR_MASK_C(1) << m->idx); + br->ports[i]->dst_mirrors &= ~(MIRROR_MASK_C(1) << m->idx); + } + + svec_destroy(&m->src_ports); + svec_destroy(&m->dst_ports); + free(m->vlans); + + m->bridge->mirrors[m->idx] = NULL; + free(m); + + bridge_flush(br); + } +} + +static void +prune_ports(struct mirror *m, struct svec *ports) +{ + struct svec tmp; + size_t i; + + svec_sort_unique(ports); + + svec_init(&tmp); + for (i = 0; i < ports->n; i++) { + const char *name = ports->names[i]; + if (port_lookup(m->bridge, name)) { + svec_add(&tmp, name); + } else { + VLOG_WARN("mirror.%s.%s: cannot match on nonexistent port %s", + m->bridge->name, m->name, name); + } + } + svec_swap(ports, &tmp); + svec_destroy(&tmp); +} + +static size_t +prune_vlans(struct mirror *m, struct svec *vlan_strings, int **vlans) +{ + size_t n_vlans, i; + + /* This isn't perfect: it won't combine "0" and "00", and the textual sort + * order won't give us numeric sort order. But that's good enough for what + * we need right now. */ + svec_sort_unique(vlan_strings); + + *vlans = xmalloc(sizeof *vlans * vlan_strings->n); + n_vlans = 0; + for (i = 0; i < vlan_strings->n; i++) { + const char *name = vlan_strings->names[i]; + int vlan; + if (!str_to_int(name, 10, &vlan) || vlan < 0 || vlan > 4095) { + VLOG_WARN("mirror.%s.%s.select.vlan: ignoring invalid VLAN %s", + m->bridge->name, m->name, name); + } else { + (*vlans)[n_vlans++] = vlan; + } + } + return n_vlans; +} + +static bool +vlan_is_mirrored(const struct mirror *m, int vlan) +{ + size_t i; + + for (i = 0; i < m->n_vlans; i++) { + if (m->vlans[i] == vlan) { + return true; + } + } + return false; +} + +static bool +port_trunks_any_mirrored_vlan(const struct mirror *m, const struct port *p) +{ + size_t i; + + for (i = 0; i < m->n_vlans; i++) { + if (port_trunks_vlan(p, m->vlans[i])) { + return true; + } + } + return false; +} + +static void +mirror_reconfigure_one(struct mirror *m) +{ + char *pfx = xasprintf("mirror.%s.%s", m->bridge->name, m->name); + struct svec src_ports, dst_ports, ports; + struct svec vlan_strings; + mirror_mask_t mirror_bit; + const char *out_port_name; + struct port *out_port; + int out_vlan; + size_t n_vlans; + int *vlans; + size_t i; + bool mirror_all_ports; + + /* Get output port. */ + out_port_name = cfg_get_key(0, "mirror.%s.%s.output.port", + m->bridge->name, m->name); + if (out_port_name) { + out_port = port_lookup(m->bridge, out_port_name); + if (!out_port) { + VLOG_ERR("%s.output.port: bridge %s does not have a port " + "named %s", pfx, m->bridge->name, out_port_name); + mirror_destroy(m); + free(pfx); + return; + } + out_vlan = -1; + + if (cfg_has("%s.output.vlan", pfx)) { + VLOG_ERR("%s.output.port and %s.output.vlan both specified; " + "ignoring %s.output.vlan", pfx, pfx, pfx); + } + } else if (cfg_has("%s.output.vlan", pfx)) { + out_port = NULL; + out_vlan = cfg_get_vlan(0, "%s.output.vlan", pfx); + } else { + VLOG_ERR("%s: neither %s.output.port nor %s.output.vlan specified, " + "but exactly one is required; disabling port mirror %s", + pfx, pfx, pfx, pfx); + mirror_destroy(m); + free(pfx); + return; + } + + /* Get all the ports, and drop duplicates and ports that don't exist. */ + svec_init(&src_ports); + svec_init(&dst_ports); + svec_init(&ports); + cfg_get_all_keys(&src_ports, "%s.select.src-port", pfx); + cfg_get_all_keys(&dst_ports, "%s.select.dst-port", pfx); + cfg_get_all_keys(&ports, "%s.select.port", pfx); + svec_append(&src_ports, &ports); + svec_append(&dst_ports, &ports); + svec_destroy(&ports); + prune_ports(m, &src_ports); + prune_ports(m, &dst_ports); + + /* Get all the vlans, and drop duplicate and invalid vlans. */ + svec_init(&vlan_strings); + cfg_get_all_keys(&vlan_strings, "%s.select.vlan", pfx); + n_vlans = prune_vlans(m, &vlan_strings, &vlans); + svec_destroy(&vlan_strings); + + /* Update mirror data. */ + if (!svec_equal(&m->src_ports, &src_ports) + || !svec_equal(&m->dst_ports, &dst_ports) + || m->n_vlans != n_vlans + || memcmp(m->vlans, vlans, sizeof *vlans * n_vlans) + || m->out_port != out_port + || m->out_vlan != out_vlan) { + bridge_flush(m->bridge); + } + svec_swap(&m->src_ports, &src_ports); + svec_swap(&m->dst_ports, &dst_ports); + free(m->vlans); + m->vlans = vlans; + m->n_vlans = n_vlans; + m->out_port = out_port; + m->out_vlan = out_vlan; + + /* If no selection criteria have been given, mirror for all ports. */ + mirror_all_ports = (!m->src_ports.n) && (!m->dst_ports.n) && (!m->n_vlans); + + /* Update ports. */ + mirror_bit = MIRROR_MASK_C(1) << m->idx; + for (i = 0; i < m->bridge->n_ports; i++) { + struct port *port = m->bridge->ports[i]; + + if (mirror_all_ports + || svec_contains(&m->src_ports, port->name) + || (m->n_vlans + && (!port->vlan + ? port_trunks_any_mirrored_vlan(m, port) + : vlan_is_mirrored(m, port->vlan)))) { + port->src_mirrors |= mirror_bit; + } else { + port->src_mirrors &= ~mirror_bit; + } + + if (mirror_all_ports || svec_contains(&m->dst_ports, port->name)) { + port->dst_mirrors |= mirror_bit; + } else { + port->dst_mirrors &= ~mirror_bit; + } + } + + /* Clean up. */ + svec_destroy(&src_ports); + svec_destroy(&dst_ports); + free(pfx); +} + +/* Spanning tree protocol. */ + +static void brstp_update_port_state(struct port *); + +static void +brstp_send_bpdu(struct ofpbuf *pkt, int port_no, void *br_) +{ + struct bridge *br = br_; + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + struct iface *iface = iface_from_dp_ifidx(br, port_no); + if (!iface) { + VLOG_WARN_RL(&rl, "%s: cannot send BPDU on unknown port %d", + br->name, port_no); + } else if (eth_addr_is_zero(iface->mac)) { + VLOG_WARN_RL(&rl, "%s: cannot send BPDU on port %d with unknown MAC", + br->name, port_no); + } else { + union ofp_action action; + struct eth_header *eth = pkt->l2; + flow_t flow; + + memcpy(eth->eth_src, iface->mac, ETH_ADDR_LEN); + + memset(&action, 0, sizeof action); + action.type = htons(OFPAT_OUTPUT); + action.output.len = htons(sizeof action); + action.output.port = htons(port_no); + + flow_extract(pkt, ODPP_NONE, &flow); + ofproto_send_packet(br->ofproto, &flow, &action, 1, pkt); + } + ofpbuf_delete(pkt); +} + +static void +brstp_reconfigure(struct bridge *br) +{ + size_t i; + + if (!cfg_get_bool(0, "stp.%s.enabled", br->name)) { + if (br->stp) { + stp_destroy(br->stp); + br->stp = NULL; + + bridge_flush(br); + } + } else { + uint64_t bridge_address, bridge_id; + int bridge_priority; + + bridge_address = cfg_get_mac(0, "stp.%s.address", br->name); + if (!bridge_address) { + if (br->stp) { + bridge_address = (stp_get_bridge_id(br->stp) + & ((UINT64_C(1) << 48) - 1)); + } else { + uint8_t mac[ETH_ADDR_LEN]; + eth_addr_random(mac); + bridge_address = eth_addr_to_uint64(mac); + } + } + + if (cfg_is_valid(CFG_INT | CFG_REQUIRED, "stp.%s.priority", + br->name)) { + bridge_priority = cfg_get_int(0, "stp.%s.priority", br->name); + } else { + bridge_priority = STP_DEFAULT_BRIDGE_PRIORITY; + } + + bridge_id = bridge_address | ((uint64_t) bridge_priority << 48); + if (!br->stp) { + br->stp = stp_create(br->name, bridge_id, brstp_send_bpdu, br); + br->stp_last_tick = time_msec(); + bridge_flush(br); + } else { + if (bridge_id != stp_get_bridge_id(br->stp)) { + stp_set_bridge_id(br->stp, bridge_id); + bridge_flush(br); + } + } + + for (i = 0; i < br->n_ports; i++) { + struct port *p = br->ports[i]; + int dp_ifidx; + struct stp_port *sp; + int path_cost, priority; + bool enable; + + if (!p->n_ifaces) { + continue; + } + dp_ifidx = p->ifaces[0]->dp_ifidx; + if (dp_ifidx < 0 || dp_ifidx >= STP_MAX_PORTS) { + continue; + } + + sp = stp_get_port(br->stp, dp_ifidx); + enable = (!cfg_is_valid(CFG_BOOL | CFG_REQUIRED, + "stp.%s.port.%s.enabled", + br->name, p->name) + || cfg_get_bool(0, "stp.%s.port.%s.enabled", + br->name, p->name)); + if (p->is_mirror_output_port) { + enable = false; + } + if (enable != (stp_port_get_state(sp) != STP_DISABLED)) { + bridge_flush(br); /* Might not be necessary. */ + if (enable) { + stp_port_enable(sp); + } else { + stp_port_disable(sp); + } + } + + path_cost = cfg_get_int(0, "stp.%s.port.%s.path-cost", + br->name, p->name); + stp_port_set_path_cost(sp, path_cost ? path_cost : 19 /* XXX */); + + priority = (cfg_is_valid(CFG_INT | CFG_REQUIRED, + "stp.%s.port.%s.priority", + br->name, p->name) + ? cfg_get_int(0, "stp.%s.port.%s.priority", + br->name, p->name) + : STP_DEFAULT_PORT_PRIORITY); + stp_port_set_priority(sp, priority); + } + + brstp_adjust_timers(br); + } + for (i = 0; i < br->n_ports; i++) { + brstp_update_port_state(br->ports[i]); + } +} + +static void +brstp_update_port_state(struct port *p) +{ + struct bridge *br = p->bridge; + enum stp_state state; + + /* Figure out new state. */ + state = STP_DISABLED; + if (br->stp && p->n_ifaces > 0) { + int dp_ifidx = p->ifaces[0]->dp_ifidx; + if (dp_ifidx >= 0 && dp_ifidx < STP_MAX_PORTS) { + state = stp_port_get_state(stp_get_port(br->stp, dp_ifidx)); + } + } + + /* Update state. */ + if (p->stp_state != state) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10); + VLOG_INFO_RL(&rl, "port %s: STP state changed from %s to %s", + p->name, stp_state_name(p->stp_state), + stp_state_name(state)); + if (p->stp_state == STP_DISABLED) { + bridge_flush(br); + } else { + ofproto_revalidate(p->bridge->ofproto, p->stp_state_tag); + } + p->stp_state = state; + p->stp_state_tag = (p->stp_state == STP_DISABLED ? 0 + : tag_create_random()); + } +} + +static void +brstp_adjust_timers(struct bridge *br) +{ + int hello_time = cfg_get_int(0, "stp.%s.hello-time", br->name); + int max_age = cfg_get_int(0, "stp.%s.max-age", br->name); + int forward_delay = cfg_get_int(0, "stp.%s.forward-delay", br->name); + + stp_set_hello_time(br->stp, hello_time ? hello_time : 2000); + stp_set_max_age(br->stp, max_age ? max_age : 20000); + stp_set_forward_delay(br->stp, forward_delay ? forward_delay : 15000); +} + +static void +brstp_run(struct bridge *br) +{ + if (br->stp) { + long long int now = time_msec(); + long long int elapsed = now - br->stp_last_tick; + struct stp_port *sp; + + if (elapsed > 0) { + stp_tick(br->stp, MIN(INT_MAX, elapsed)); + br->stp_last_tick = now; + } + while (stp_get_changed_port(br->stp, &sp)) { + struct port *p = port_from_dp_ifidx(br, stp_port_no(sp)); + if (p) { + brstp_update_port_state(p); + } + } + } +} + +static void +brstp_wait(struct bridge *br) +{ + if (br->stp) { + poll_timer_wait(1000); + } +} diff --git a/vswitchd/bridge.h b/vswitchd/bridge.h new file mode 100644 index 00000000..b9435370 --- /dev/null +++ b/vswitchd/bridge.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2008, 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#ifndef VSWITCHD_BRIDGE_H +#define VSWITCHD_BRIDGE_H 1 + +#include <stddef.h> +#include "list.h" + +struct svec; + +void bridge_init(void); +void bridge_reconfigure(void); +int bridge_run(void); +void bridge_wait(void); +bool bridge_exists(const char *); +uint64_t bridge_get_datapathid(const char *name); +void bridge_get_ifaces(struct svec *svec); + +#endif /* bridge.h */ diff --git a/vswitchd/mgmt.c b/vswitchd/mgmt.c new file mode 100644 index 00000000..f5dcd184 --- /dev/null +++ b/vswitchd/mgmt.c @@ -0,0 +1,679 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include <config.h> + +#include <arpa/inet.h> +#include <assert.h> +#include <errno.h> +#include <stdlib.h> + +#include "bridge.h" +#include "cfg.h" +#include "coverage.h" +#include "list.h" +#include "mgmt.h" +#include "openflow/nicira-ext.h" +#include "openflow/openflow.h" +#include "openflow/openflow-mgmt.h" +#include "ofpbuf.h" +#include "ovs-vswitchd.h" +#include "packets.h" +#include "rconn.h" +#include "svec.h" +#include "vconn.h" +#include "vconn-ssl.h" +#include "xtoxll.h" + +#define THIS_MODULE VLM_mgmt +#include "vlog.h" + +#define MAX_BACKOFF_DEFAULT 15 +#define INACTIVITY_PROBE_DEFAULT 15 + +static struct svec mgmt_cfg; +static uint8_t cfg_cookie[CFG_COOKIE_LEN]; +static struct rconn *mgmt_rconn; +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60); +static struct svec capabilities; +uint64_t mgmt_id; + + +#define TXQ_LIMIT 128 /* Max number of packets to queue for tx. */ +struct rconn_packet_counter *txqlen; /* # pkts queued for tx on mgmt_rconn. */ + +static uint64_t pick_fallback_mgmt_id(void); +static void send_config_update(uint32_t xid, bool use_xid); +static void send_resources_update(uint32_t xid, bool use_xid); + +void +mgmt_init(void) +{ + txqlen = rconn_packet_counter_create(); + + svec_init(&mgmt_cfg); + svec_init(&capabilities); + svec_add_nocopy(&capabilities, + xasprintf("com.nicira.mgmt.manager=true\n")); + + mgmt_id = cfg_get_dpid(0, "mgmt.id"); + if (!mgmt_id) { + /* Randomly generate a mgmt id */ + mgmt_id = pick_fallback_mgmt_id(); + } +} + +#ifdef HAVE_OPENSSL +static bool +config_string_change(const char *key, char **valuep) +{ + const char *value = cfg_get_string(0, "%s", key); + if (value && (!*valuep || strcmp(value, *valuep))) { + free(*valuep); + *valuep = xstrdup(value); + return true; + } else { + return false; + } +} + +static void +mgmt_configure_ssl(void) +{ + static char *private_key_file; + static char *certificate_file; + static char *cacert_file; + + /* XXX SSL should be configurable separate from the bridges. + * XXX should be possible to de-configure SSL. */ + if (config_string_change("ssl.private-key", &private_key_file)) { + vconn_ssl_set_private_key_file(private_key_file); + } + + if (config_string_change("ssl.certificate", &certificate_file)) { + vconn_ssl_set_certificate_file(certificate_file); + } + + if (config_string_change("ssl.ca-cert", &cacert_file)) { + vconn_ssl_set_ca_cert_file(cacert_file, + cfg_get_bool(0, "ssl.bootstrap-ca-cert")); + } +} +#endif + +void +mgmt_reconfigure(void) +{ + struct svec new_cfg; + uint8_t new_cookie[CFG_COOKIE_LEN]; + bool cfg_updated = false; + const char *controller_name; + int max_backoff; + int inactivity_probe; + int retval; + + if (!cfg_has_section("mgmt")) { + if (mgmt_rconn) { + rconn_destroy(mgmt_rconn); + mgmt_rconn = NULL; + } + return; + } + + /* If this is an established connection, send a resources update. */ + /* xxx This is wasteful if there were no resource changes!!! */ + if (mgmt_rconn) { + send_resources_update(0, false); + } + + cfg_get_cookie(new_cookie); + if (memcmp(cfg_cookie, new_cookie, sizeof(cfg_cookie))) { + memcpy(cfg_cookie, new_cookie, sizeof(cfg_cookie)); + cfg_updated = true; + } + + svec_init(&new_cfg); + cfg_get_section(&new_cfg, "mgmt"); + if (svec_equal(&mgmt_cfg, &new_cfg)) { + /* Reconnecting to the controller causes the config file to be + * resent automatically. If we're not reconnecting and the + * config file has changed, we need to notify the controller of + * changes. */ + if (cfg_updated && mgmt_rconn) { + send_config_update(0, false); + } + svec_destroy(&new_cfg); + return; + } + + controller_name = cfg_get_string(0, "mgmt.controller"); + if (!controller_name) { + VLOG_ERR("no controller specified for managment"); + svec_destroy(&new_cfg); + return; + } + + max_backoff = cfg_get_int(0, "mgmt.max-backoff"); + if (max_backoff < 1) { + max_backoff = MAX_BACKOFF_DEFAULT; + } else if (max_backoff > 3600) { + max_backoff = 3600; + } + + inactivity_probe = cfg_get_int(0, "mgmt.inactivity-probe"); + if (inactivity_probe < 5) { + inactivity_probe = INACTIVITY_PROBE_DEFAULT; + } + + /* xxx If this changes, we need to restart bridges to use new id, + * xxx but they need the id before the connect to controller, but we + * xxx need their dpids. */ + /* Check if a different mgmt id has been assigned. */ + if (cfg_has("mgmt.id")) { + uint64_t cfg_mgmt_id = cfg_get_dpid(0, "mgmt.id"); + if (cfg_mgmt_id != mgmt_id) { + mgmt_id = cfg_mgmt_id; + } + } + + svec_swap(&new_cfg, &mgmt_cfg); + svec_destroy(&new_cfg); + +#ifdef HAVE_OPENSSL + /* Configure SSL. */ + mgmt_configure_ssl(); +#endif + + if (mgmt_rconn) { + rconn_destroy(mgmt_rconn); + mgmt_rconn = NULL; + } + mgmt_rconn = rconn_create(inactivity_probe, max_backoff); + retval = rconn_connect(mgmt_rconn, controller_name); + if (retval == EAFNOSUPPORT) { + VLOG_ERR("no support for %s vconn", controller_name); + } +} + +static int +send_openflow_buffer(struct ofpbuf *buffer) +{ + int retval; + + if (!mgmt_rconn) { + VLOG_ERR("attempt to send openflow packet with no rconn\n"); + return EINVAL; + } + + update_openflow_length(buffer); + retval = rconn_send_with_limit(mgmt_rconn, buffer, txqlen, TXQ_LIMIT); + if (retval) { + VLOG_WARN_RL(&rl, "send to %s failed: %s", + rconn_get_name(mgmt_rconn), strerror(retval)); + } + return retval; +} + +static void +send_features_reply(uint32_t xid) +{ + struct ofpbuf *buffer; + struct ofp_switch_features *ofr; + + ofr = make_openflow_xid(sizeof *ofr, OFPT_FEATURES_REPLY, xid, &buffer); + ofr->datapath_id = 0; + ofr->n_tables = 0; + ofr->n_buffers = 0; + ofr->capabilities = 0; + ofr->actions = 0; + send_openflow_buffer(buffer); +} + +static void * +make_ofmp_xid(size_t ofmp_len, uint16_t type, uint32_t xid, + struct ofpbuf **bufferp) +{ + struct ofmp_header *oh; + + oh = make_openflow_xid(ofmp_len, OFPT_VENDOR, xid, bufferp); + oh->header.vendor = htonl(NX_VENDOR_ID); + oh->header.subtype = htonl(NXT_MGMT); + oh->type = htons(type); + + return oh; +} + +static void * +make_ofmp(size_t ofmp_len, uint16_t type, struct ofpbuf **bufferp) +{ + struct ofmp_header *oh; + + oh = make_openflow(ofmp_len, OFPT_VENDOR, bufferp); + oh->header.vendor = htonl(NX_VENDOR_ID); + oh->header.subtype = htonl(NXT_MGMT); + oh->type = htons(type); + + return oh; +} + +static void +send_capability_reply(uint32_t xid) +{ + int i; + struct ofpbuf *buffer; + struct ofmp_capability_reply *ofmpcr; + + ofmpcr = make_ofmp_xid(sizeof *ofmpcr, OFMPT_CAPABILITY_REPLY, + xid, &buffer); + ofmpcr->format = htonl(OFMPCOF_SIMPLE); + ofmpcr->mgmt_id = htonll(mgmt_id); + for (i=0; i<capabilities.n; i++) { + ofpbuf_put(buffer, capabilities.names[i], + strlen(capabilities.names[i])); + } + send_openflow_buffer(buffer); +} + +static void +send_resources_update(uint32_t xid, bool use_xid) +{ + struct ofpbuf *buffer; + struct ofmp_resources_update *ofmpru; + struct ofmp_tlv *tlv; + struct svec br_list; + int i; + + if (use_xid) { + ofmpru = make_ofmp_xid(sizeof *ofmpru, OFMPT_RESOURCES_UPDATE, + xid, &buffer); + } else { + ofmpru = make_ofmp(sizeof *ofmpru, OFMPT_RESOURCES_UPDATE, &buffer); + } + + svec_init(&br_list); + cfg_get_subsections(&br_list, "bridge"); + for (i=0; i < br_list.n; i++) { + struct ofmptsr_dp *dp_tlv; + uint64_t dp_id = bridge_get_datapathid(br_list.names[i]); + if (!dp_id) { + VLOG_WARN_RL(&rl, "bridge %s doesn't seem to exist", + br_list.names[i]); + continue; + } + dp_tlv = ofpbuf_put_zeros(buffer, sizeof(*dp_tlv)); + dp_tlv->type = htons(OFMPTSR_DP); + dp_tlv->len = htons(sizeof(*dp_tlv)); + + dp_tlv->dp_id = htonll(dp_id); + memcpy(dp_tlv->name, br_list.names[i], strlen(br_list.names[i])+1); + } + + /* Put end marker. */ + tlv = ofpbuf_put_zeros(buffer, sizeof(*tlv)); + tlv->type = htons(OFMPTSR_END); + tlv->len = htons(sizeof(*tlv)); + send_openflow_buffer(buffer); +} + +static void +send_config_update(uint32_t xid, bool use_xid) +{ + struct ofpbuf *buffer; + struct ofmp_config_update *ofmpcu; + + if (use_xid) { + ofmpcu = make_ofmp_xid(sizeof *ofmpcu, OFMPT_CONFIG_UPDATE, + xid, &buffer); + } else { + ofmpcu = make_ofmp(sizeof *ofmpcu, OFMPT_CONFIG_UPDATE, &buffer); + } + + ofmpcu->format = htonl(OFMPCOF_SIMPLE); + memcpy(ofmpcu->cookie, cfg_cookie, sizeof(ofmpcu->cookie)); + cfg_buf_put(buffer); + send_openflow_buffer(buffer); +} + +static void +send_config_update_ack(uint32_t xid, bool success) +{ + struct ofpbuf *buffer; + struct ofmp_config_update_ack *ofmpcua; + + ofmpcua = make_ofmp_xid(sizeof *ofmpcua, OFMPT_CONFIG_UPDATE_ACK, + xid, &buffer); + + ofmpcua->format = htonl(OFMPCOF_SIMPLE); + if (success) { + ofmpcua->flags = htonl(OFMPCUAF_SUCCESS); + } + cfg_get_cookie(ofmpcua->cookie); + send_openflow_buffer(buffer); +} + +static void +send_ofmp_error_msg(uint32_t xid, uint16_t type, uint16_t code, + const void *data, size_t len) +{ + struct ofpbuf *buffer; + struct ofmp_error_msg *oem; + + oem = make_ofmp_xid(sizeof(*oem)+len, OFMPT_ERROR, xid, &buffer); + oem->type = htons(type); + oem->code = htons(code); + memcpy(oem->data, data, len); + send_openflow_buffer(buffer); +} + +static void +send_error_msg(uint32_t xid, uint16_t type, uint16_t code, + const void *data, size_t len) +{ + struct ofpbuf *buffer; + struct ofp_error_msg *oem; + + oem = make_openflow_xid(sizeof(*oem)+len, OFPT_ERROR, xid, &buffer); + oem->type = htons(type); + oem->code = htons(code); + memcpy(oem->data, data, len); + send_openflow_buffer(buffer); +} + +static int +recv_echo_request(uint32_t xid UNUSED, const void *msg) +{ + const struct ofp_header *rq = msg; + send_openflow_buffer(make_echo_reply(rq)); + return 0; +} + +static int +recv_features_request(uint32_t xid, const void *msg UNUSED) +{ + send_features_reply(xid); + return 0; +} + +static int +recv_set_config(uint32_t xid UNUSED, const void *msg UNUSED) +{ + /* Nothing to configure! */ + return 0; +} + +static int +recv_ofmp_capability_request(uint32_t xid, const struct ofmp_header *ofmph) +{ + struct ofmp_capability_request *ofmpcr; + + if (htons(ofmph->header.header.length) != sizeof(*ofmpcr)) { + /* xxx Send error */ + return -EINVAL; + } + + ofmpcr = (struct ofmp_capability_request *)ofmph; + if (ofmpcr->format != htonl(OFMPCAF_SIMPLE)) { + /* xxx Send error */ + return -EINVAL; + } + + send_capability_reply(xid); + + return 0; +} + +static int +recv_ofmp_resources_request(uint32_t xid, const void *msg UNUSED) +{ + send_resources_update(xid, true); + return 0; +} + +static int +recv_ofmp_config_request(uint32_t xid, const struct ofmp_header *ofmph) +{ + struct ofmp_config_request *ofmpcr; + + if (htons(ofmph->header.header.length) != sizeof(*ofmpcr)) { + /* xxx Send error */ + return -EINVAL; + } + + ofmpcr = (struct ofmp_config_request *)ofmph; + if (ofmpcr->format != htonl(OFMPCOF_SIMPLE)) { + /* xxx Send error */ + return -EINVAL; + } + + send_config_update(xid, true); + + return 0; +} + +static int +recv_ofmp_config_update(uint32_t xid, const struct ofmp_header *ofmph) +{ + struct ofmp_config_update *ofmpcu; + int data_len; + + data_len = htons(ofmph->header.header.length) - sizeof(*ofmpcu); + if (data_len <= sizeof(*ofmpcu)) { + /* xxx Send error. */ + return -EINVAL; + } + + ofmpcu = (struct ofmp_config_update *)ofmph; + if (ofmpcu->format != htonl(OFMPCOF_SIMPLE)) { + /* xxx Send error */ + return -EINVAL; + } + + /* Check if the supplied cookie matches our current understanding of + * it. If they don't match, tell the controller and let it sort + * things out. */ + if (cfg_lock(ofmpcu->cookie, 0)) { + /* xxx cfg_lock can fail for other reasons, such as being + * xxx locked... */ + VLOG_WARN_RL(&rl, "config update failed due to bad cookie\n"); + send_config_update_ack(xid, false); + return 0; + } + + /* xxx We should probably do more sanity checking than this. */ + + cfg_write_data(ofmpcu->data, data_len); + cfg_unlock(); + + /* Send the ACK before running reconfigure, since our management + * connection settings may have changed. */ + send_config_update_ack(xid, true); + + reconfigure(); + + + return 0; +} + +static +int recv_ofmp(uint32_t xid, struct ofmp_header *ofmph) +{ + /* xxx Should sanity-check for min/max length */ + switch (ntohs(ofmph->type)) + { + case OFMPT_CAPABILITY_REQUEST: + return recv_ofmp_capability_request(xid, ofmph); + case OFMPT_RESOURCES_REQUEST: + return recv_ofmp_resources_request(xid, ofmph); + case OFMPT_CONFIG_REQUEST: + return recv_ofmp_config_request(xid, ofmph); + case OFMPT_CONFIG_UPDATE: + return recv_ofmp_config_update(xid, ofmph); + default: + VLOG_WARN_RL(&rl, "unknown mgmt message: %d", + ntohs(ofmph->type)); + return -EINVAL; + } +} + +static int +recv_nx_msg(uint32_t xid, const void *oh) +{ + const struct nicira_header *nh = oh; + + switch (ntohl(nh->subtype)) { + + case NXT_MGMT: + return recv_ofmp(xid, (struct ofmp_header *)oh); + + default: + send_error_msg(xid, OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE, + oh, htons(nh->header.length)); + return -EINVAL; + } +} + +static int +recv_vendor(uint32_t xid, const void *oh) +{ + const struct ofp_vendor_header *ovh = oh; + + switch (ntohl(ovh->vendor)) + { + case NX_VENDOR_ID: + return recv_nx_msg(xid, oh); + + default: + VLOG_WARN_RL(&rl, "unknown vendor: 0x%x", ntohl(ovh->vendor)); + send_error_msg(xid, OFPET_BAD_REQUEST, OFPBRC_BAD_VENDOR, + oh, ntohs(ovh->header.length)); + return -EINVAL; + } +} + +static int +handle_msg(uint32_t xid, const void *msg, size_t length) +{ + int (*handler)(uint32_t, const void *); + struct ofp_header *oh; + size_t min_size; + + COVERAGE_INC(mgmt_received); + + /* Check encapsulated length. */ + oh = (struct ofp_header *) msg; + if (ntohs(oh->length) > length) { + return -EINVAL; + } + assert(oh->version == OFP_VERSION); + + /* Figure out how to handle it. */ + switch (oh->type) { + case OFPT_ECHO_REQUEST: + min_size = sizeof(struct ofp_header); + handler = recv_echo_request; + break; + case OFPT_ECHO_REPLY: + return 0; + case OFPT_FEATURES_REQUEST: + min_size = sizeof(struct ofp_header); + handler = recv_features_request; + break; + case OFPT_SET_CONFIG: + min_size = sizeof(struct ofp_switch_config); + handler = recv_set_config; + break; + case OFPT_VENDOR: + min_size = sizeof(struct ofp_vendor_header); + handler = recv_vendor; + break; + default: + VLOG_WARN_RL(&rl, "unknown openflow type: %d", oh->type); + send_error_msg(xid, OFPET_BAD_REQUEST, OFPBRC_BAD_TYPE, + msg, length); + return -EINVAL; + } + + /* Handle it. */ + if (length < min_size) { + return -EFAULT; + } + return handler(xid, msg); +} + +void +mgmt_run(void) +{ + int i; + + if (!mgmt_rconn) { + return; + } + + rconn_run(mgmt_rconn); + + /* Do some processing, but cap it at a reasonable amount so that + * other processing doesn't starve. */ + for (i=0; i<50; i++) { + struct ofpbuf *buffer; + struct ofp_header *oh; + + buffer = rconn_recv(mgmt_rconn); + if (!buffer) { + break; + } + + if (buffer->size >= sizeof *oh) { + oh = buffer->data; + handle_msg(oh->xid, buffer->data, buffer->size); + ofpbuf_delete(buffer); + } else { + VLOG_WARN_RL(&rl, "received too-short OpenFlow message"); + } + } +} + +void +mgmt_wait(void) +{ + if (!mgmt_rconn) { + return; + } + + rconn_run_wait(mgmt_rconn); + rconn_recv_wait(mgmt_rconn); +} + +static uint64_t +pick_fallback_mgmt_id(void) +{ + uint8_t ea[ETH_ADDR_LEN]; + eth_addr_random(ea); + ea[0] = 0x00; /* Set Nicira OUI. */ + ea[1] = 0x23; + ea[2] = 0x20; + return eth_addr_to_uint64(ea); +} diff --git a/vswitchd/mgmt.h b/vswitchd/mgmt.h new file mode 100644 index 00000000..6db66c69 --- /dev/null +++ b/vswitchd/mgmt.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#ifndef VSWITCHD_MGMT_H +#define VSWITCHD_MGMT_H 1 + +void mgmt_init(void); +void mgmt_reconfigure(void); +void mgmt_run(void); +void mgmt_wait(void); +uint64_t mgmt_get_mgmt_id(void); + +#endif /* mgmt.h */ diff --git a/vswitchd/ovs-brcompatd.8.in b/vswitchd/ovs-brcompatd.8.in new file mode 100644 index 00000000..ebd67028 --- /dev/null +++ b/vswitchd/ovs-brcompatd.8.in @@ -0,0 +1,49 @@ +.TH ovs\-brcompatd 8 "March 2009" "Open vSwitch" "Open vSwitch Manual" +.ds PN ovs\-brcompatd +. +.SH NAME +ovs\-brcompatd \- Bridge compatibility front-end for ovs\-vswitchd +. +.SH SYNOPSIS +.B ovs\-brcompatd +[\fIoptions\fR] \fIconfig\fR +. +.SH DESCRIPTION +A daemon that provides a legacy bridge front-end for \fBovs\-vswitchd\fR. It +does this by listening for bridge ioctl commands (e.g., those generated by +the \fBbrctl\fR program) to add or remove datapaths and the interfaces +that attach to them. It modifies \fIconfig\fR and forces +\fBovs\-vswitchd\fR to reload its configuration file. +.PP +.SH OPTIONS +.IP "\fB--reload-command=\fIcommand\fR" +Sets the command that \fBovs\-brcompatd\fR runs to force \fBovs\-vswitchd\fR to +reload its configuration file to \fIcommand\fR. The command is run in +a subshell, so it may contain arbitrary shell metacharacters, etc. +The \fB--help\fR option displays the default reload command. +.TP +\fB--prune-timeout=\fIsecs\fR +. +Sets the maximum time between pruning port entries to \fIsecs\fR seconds. +Pruning ports is the process of removing port entries from \fIconfig\fR +that no longer exist. If \fIsecs\fR is zero, then entries are never +pruned. The default prune timeout is 5 seconds. +.TP +\fB--lock-timeout=\fImsecs\fR +. +Sets the maximum time to wait for \fIconfig\fR to become unlocked to +\fImsecs\fR milliseconds. The default lock timeout is 500 milliseconds. +. +.so lib/daemon.man +.so lib/vlog.man +.so lib/common.man +.so lib/leak-checker.man +. +.SH NOTES +\fBovs\-brcompatd\fR requires the \fBbrcompat_mod.ko\fR kernel module to be +loaded. +.SH "SEE ALSO" +.BR ovs\-appctl (8), +.BR ovs\-vswitchd (8), +.BR ovs\-vswitchd.conf (5), +\fBINSTALL\fR in the Open vSwitch distribution. diff --git a/vswitchd/ovs-brcompatd.c b/vswitchd/ovs-brcompatd.c new file mode 100644 index 00000000..93d9469b --- /dev/null +++ b/vswitchd/ovs-brcompatd.c @@ -0,0 +1,766 @@ +/* Copyright (c) 2008, 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + */ + +#include <config.h> + +#include <assert.h> +#include <errno.h> +#include <getopt.h> +#include <inttypes.h> +#include <limits.h> +#include <net/if.h> +#include <linux/genetlink.h> +#include <linux/rtnetlink.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> + +#include "cfg.h" +#include "command-line.h" +#include "coverage.h" +#include "daemon.h" +#include "dirs.h" +#include "dpif.h" +#include "fatal-signal.h" +#include "fault.h" +#include "leak-checker.h" +#include "netdev.h" +#include "netlink.h" +#include "ofpbuf.h" +#include "openvswitch/brcompat-netlink.h" +#include "poll-loop.h" +#include "process.h" +#include "signals.h" +#include "svec.h" +#include "timeval.h" +#include "unixctl.h" +#include "util.h" + +#include "vlog.h" +#define THIS_MODULE VLM_brcompatd + + +/* xxx Just hangs if datapath is rmmod/insmod. Learn to reconnect? */ + +/* Actions to modify bridge compatibility configuration. */ +enum bmc_action { + BMC_ADD_DP, + BMC_DEL_DP, + BMC_ADD_PORT, + BMC_DEL_PORT +}; + +static void parse_options(int argc, char *argv[]); +static void usage(void) NO_RETURN; + +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 60); + +/* Maximum number of milliseconds to wait for the config file to be + * unlocked. If set to zero, no waiting will occur. */ +static int lock_timeout = 500; + +/* Maximum number of milliseconds to wait before pruning port entries that + * no longer exist. If set to zero, ports are never pruned. */ +static int prune_timeout = 5000; + +/* Config file shared with ovs-vswitchd (usually ovs-vswitchd.conf). */ +static char *config_file; + +/* Command to run (via system()) to reload the ovs-vswitchd configuration + * file. */ +static char *reload_command; + +/* Netlink socket to listen for interface changes. */ +static struct nl_sock *rtnl_sock; + +/* Netlink socket to bridge compatibility kernel module. */ +static struct nl_sock *brc_sock; + +/* The Generic Netlink family number used for bridge compatibility. */ +static int brc_family; + +static const struct nl_policy brc_multicast_policy[] = { + [BRC_GENL_A_MC_GROUP] = {.type = NL_A_U32 } +}; + +static const struct nl_policy rtnlgrp_link_policy[] = { + [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false }, + [IFLA_MASTER] = { .type = NL_A_U32, .optional = true }, +}; + +static int +lookup_brc_multicast_group(int *multicast_group) +{ + struct nl_sock *sock; + struct ofpbuf request, *reply; + struct nlattr *attrs[ARRAY_SIZE(brc_multicast_policy)]; + int retval; + + retval = nl_sock_create(NETLINK_GENERIC, 0, 0, 0, &sock); + if (retval) { + return retval; + } + ofpbuf_init(&request, 0); + nl_msg_put_genlmsghdr(&request, sock, 0, brc_family, + NLM_F_REQUEST, BRC_GENL_C_QUERY_MC, 1); + retval = nl_sock_transact(sock, &request, &reply); + ofpbuf_uninit(&request); + if (retval) { + nl_sock_destroy(sock); + return retval; + } + if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN, + brc_multicast_policy, attrs, + ARRAY_SIZE(brc_multicast_policy))) { + nl_sock_destroy(sock); + ofpbuf_delete(reply); + return EPROTO; + } + *multicast_group = nl_attr_get_u32(attrs[BRC_GENL_A_MC_GROUP]); + nl_sock_destroy(sock); + ofpbuf_delete(reply); + + return 0; +} + +/* Opens a socket for brcompat notifications. Returns 0 if successful, + * otherwise a positive errno value. */ +static int +brc_open(struct nl_sock **sock) +{ + int multicast_group = 0; + int retval; + + retval = nl_lookup_genl_family(BRC_GENL_FAMILY_NAME, &brc_family); + if (retval) { + return retval; + } + + retval = lookup_brc_multicast_group(&multicast_group); + if (retval) { + return retval; + } + + retval = nl_sock_create(NETLINK_GENERIC, multicast_group, 0, 0, sock); + if (retval) { + return retval; + } + + return 0; +} + +static const struct nl_policy brc_dp_policy[] = { + [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING }, +}; + +static bool +bridge_exists(const char *name) +{ + return cfg_has_section("bridge.%s", name); +} + +static int +rewrite_and_reload_config(void) +{ + if (cfg_is_dirty()) { + int error1 = cfg_write(); + int error2 = cfg_read(); + long long int reload_start = time_msec(); + int error3 = system(reload_command); + long long int elapsed = time_msec() - reload_start; + COVERAGE_INC(brcompatd_reload); + if (elapsed > 0) { + VLOG_INFO("reload command executed in %lld ms", elapsed); + } + if (error3 == -1) { + VLOG_ERR("failed to execute reload command: %s", strerror(errno)); + } else if (error3 != 0) { + char *msg = process_status_msg(error3); + VLOG_ERR("reload command exited with error (%s)", msg); + free(msg); + } + return error1 ? error1 : error2 ? error2 : error3 ? ECHILD : 0; + } + return 0; +} + +/* Go through the configuration file and remove any ports that no longer + * exist associated with a bridge. */ +static void +prune_ports(void) +{ + int i, j; + int error; + struct svec bridges, delete; + + if (cfg_lock(NULL, 0)) { + /* Couldn't lock config file. */ + return; + } + + svec_init(&bridges); + svec_init(&delete); + cfg_get_subsections(&bridges, "bridge"); + for (i=0; i<bridges.n; i++) { + const char *br_name = bridges.names[i]; + struct svec ports, ifaces; + + svec_init(&ports); + + /* Get all the interfaces for the given bridge, breaking bonded + * interfaces down into their constituent parts. */ + svec_init(&ifaces); + cfg_get_all_keys(&ports, "bridge.%s.port", br_name); + for (j=0; j<ports.n; j++) { + const char *port_name = ports.names[j]; + if (cfg_has_section("bonding.%s", port_name)) { + struct svec slaves; + svec_init(&slaves); + cfg_get_all_keys(&slaves, "bonding.%s.slave", port_name); + svec_append(&ifaces, &slaves); + svec_destroy(&slaves); + } else { + svec_add(&ifaces, port_name); + } + } + svec_destroy(&ports); + + /* Check that the interfaces exist. */ + for (j = 0; j < ifaces.n; j++) { + const char *iface_name = ifaces.names[j]; + enum netdev_flags flags; + + /* The local port and internal ports are created and destroyed by + * ovs-vswitchd itself, so don't bother checking for them at all. + * In practice, they might not exist if ovs-vswitchd hasn't + * finished reloading since the configuration file was updated. */ + if (!strcmp(iface_name, br_name) + || cfg_get_bool(0, "iface.%s.internal", iface_name)) { + continue; + } + + error = netdev_nodev_get_flags(iface_name, &flags); + if (error == ENODEV) { + VLOG_DBG_RL(&rl, "removing dead interface %s from %s", + iface_name, br_name); + svec_add(&delete, iface_name); + } else if (error) { + VLOG_DBG_RL(&rl, "unknown error %d on interface %s from %s", + error, iface_name, br_name); + } + } + svec_destroy(&ifaces); + } + svec_destroy(&bridges); + + if (delete.n) { + size_t i; + + for (i = 0; i < delete.n; i++) { + cfg_del_match("bridge.*.port=%s", delete.names[i]); + cfg_del_match("bonding.*.slave=%s", delete.names[i]); + } + rewrite_and_reload_config(); + cfg_unlock(); + } else { + cfg_unlock(); + } + svec_destroy(&delete); +} + + +/* Checks whether a network device named 'name' exists and returns true if so, + * false otherwise. + * + * XXX it is possible that this doesn't entirely accomplish what we want in + * context, since ovs-vswitchd.conf may cause vswitchd to create or destroy + * network devices based on iface.*.internal settings. + * + * XXX may want to move this to lib/netdev. */ +static bool +netdev_exists(const char *name) +{ + struct stat s; + char *filename; + int error; + + filename = xasprintf("/sys/class/net/%s", name); + error = stat(filename, &s); + free(filename); + return !error; +} + +static int +add_bridge(const char *br_name) +{ + if (bridge_exists(br_name)) { + VLOG_WARN("addbr %s: bridge %s exists", br_name, br_name); + return EEXIST; + } else if (netdev_exists(br_name)) { + if (cfg_get_bool(0, "iface.%s.fake-bridge", br_name)) { + VLOG_WARN("addbr %s: %s exists as a fake bridge", + br_name, br_name); + return 0; + } else { + VLOG_WARN("addbr %s: cannot create bridge %s because a network " + "device named %s already exists", + br_name, br_name, br_name); + return EEXIST; + } + } + + cfg_add_entry("bridge.%s.port=%s", br_name, br_name); + VLOG_INFO("addbr %s: success", br_name); + + return 0; +} + +static int +del_bridge(const char *br_name) +{ + if (!bridge_exists(br_name)) { + VLOG_WARN("delbr %s: no bridge named %s", br_name, br_name); + return ENXIO; + } + + cfg_del_section("bridge.%s", br_name); + VLOG_INFO("delbr %s: success", br_name); + + return 0; +} + +static int +parse_command(struct ofpbuf *buffer, uint32_t *seq, const char **br_name, + const char **port_name) +{ + static const struct nl_policy policy[] = { + [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING }, + [BRC_GENL_A_PORT_NAME] = { .type = NL_A_STRING, .optional = true }, + }; + struct nlattr *attrs[ARRAY_SIZE(policy)]; + + if (!nl_policy_parse(buffer, NLMSG_HDRLEN + GENL_HDRLEN, policy, + attrs, ARRAY_SIZE(policy)) + || (port_name && !attrs[BRC_GENL_A_PORT_NAME])) { + return EINVAL; + } + + *seq = ((struct nlmsghdr *) buffer->data)->nlmsg_seq; + *br_name = nl_attr_get_string(attrs[BRC_GENL_A_DP_NAME]); + if (port_name) { + *port_name = nl_attr_get_string(attrs[BRC_GENL_A_PORT_NAME]); + } + return 0; +} + +static void +send_reply(uint32_t seq, int error) +{ + struct ofpbuf msg; + int retval; + + /* Compose reply. */ + ofpbuf_init(&msg, 0); + nl_msg_put_genlmsghdr(&msg, brc_sock, 32, brc_family, NLM_F_REQUEST, + BRC_GENL_C_DP_RESULT, 1); + ((struct nlmsghdr *) msg.data)->nlmsg_seq = seq; + nl_msg_put_u32(&msg, BRC_GENL_A_ERR_CODE, error); + + /* Send reply. */ + retval = nl_sock_send(brc_sock, &msg, false); + if (retval) { + VLOG_WARN_RL(&rl, "replying to brcompat request: %s", + strerror(retval)); + } + ofpbuf_uninit(&msg); +} + +static int +handle_bridge_cmd(struct ofpbuf *buffer, bool add) +{ + const char *br_name; + uint32_t seq; + int error; + + error = parse_command(buffer, &seq, &br_name, NULL); + if (!error) { + error = add ? add_bridge(br_name) : del_bridge(br_name); + if (!error) { + error = rewrite_and_reload_config(); + } + send_reply(seq, error); + } + return error; +} + +static const struct nl_policy brc_port_policy[] = { + [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING }, + [BRC_GENL_A_PORT_NAME] = { .type = NL_A_STRING }, +}; + +static void +del_port(const char *br_name, const char *port_name) +{ + cfg_del_entry("bridge.%s.port=%s", br_name, port_name); + cfg_del_match("bonding.*.slave=%s", port_name); + cfg_del_match("vlan.%s.*", port_name); +} + +static int +handle_port_cmd(struct ofpbuf *buffer, bool add) +{ + const char *cmd_name = add ? "add-if" : "del-if"; + const char *br_name, *port_name; + uint32_t seq; + int error; + + error = parse_command(buffer, &seq, &br_name, &port_name); + if (!error) { + if (!bridge_exists(br_name)) { + VLOG_WARN("%s %s %s: no bridge named %s", + cmd_name, br_name, port_name, br_name); + error = EINVAL; + } else if (!netdev_exists(port_name)) { + VLOG_WARN("%s %s %s: no network device named %s", + cmd_name, br_name, port_name, port_name); + error = EINVAL; + } else { + if (add) { + cfg_add_entry("bridge.%s.port=%s", br_name, port_name); + } else { + del_port(br_name, port_name); + } + VLOG_INFO("%s %s %s: success", cmd_name, br_name, port_name); + error = rewrite_and_reload_config(); + } + send_reply(seq, error); + } + + return error; +} + +static int +brc_recv_update(void) +{ + int retval; + struct ofpbuf *buffer; + struct genlmsghdr *genlmsghdr; + + + buffer = NULL; + do { + ofpbuf_delete(buffer); + retval = nl_sock_recv(brc_sock, &buffer, false); + } while (retval == ENOBUFS + || (!retval + && (nl_msg_nlmsgerr(buffer, NULL) + || nl_msg_nlmsghdr(buffer)->nlmsg_type == NLMSG_DONE))); + if (retval) { + if (retval != EAGAIN) { + VLOG_WARN_RL(&rl, "brc_recv_update: %s", strerror(retval)); + } + return retval; + } + + genlmsghdr = nl_msg_genlmsghdr(buffer); + if (!genlmsghdr) { + VLOG_WARN_RL(&rl, "received packet too short for generic NetLink"); + goto error; + } + + if (nl_msg_nlmsghdr(buffer)->nlmsg_type != brc_family) { + VLOG_DBG_RL(&rl, "received type (%"PRIu16") != brcompat family (%d)", + nl_msg_nlmsghdr(buffer)->nlmsg_type, brc_family); + goto error; + } + + if (cfg_lock(NULL, lock_timeout)) { + /* Couldn't lock config file. */ + retval = EAGAIN; + goto error; + } + + switch (genlmsghdr->cmd) { + case BRC_GENL_C_DP_ADD: + retval = handle_bridge_cmd(buffer, true); + break; + + case BRC_GENL_C_DP_DEL: + retval = handle_bridge_cmd(buffer, false); + break; + + case BRC_GENL_C_PORT_ADD: + retval = handle_port_cmd(buffer, true); + break; + + case BRC_GENL_C_PORT_DEL: + retval = handle_port_cmd(buffer, false); + break; + + default: + retval = EPROTO; + } + + cfg_unlock(); + +error: + ofpbuf_delete(buffer); + return retval; +} + +/* Check for interface configuration changes announced through RTNL. */ +static void +rtnl_recv_update(void) +{ + struct ofpbuf *buf; + + int error = nl_sock_recv(rtnl_sock, &buf, false); + if (error == EAGAIN) { + /* Nothing to do. */ + } else if (error == ENOBUFS) { + VLOG_WARN_RL(&rl, "network monitor socket overflowed"); + } else if (error) { + VLOG_WARN_RL(&rl, "error on network monitor socket: %s", + strerror(error)); + } else { + struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)]; + struct nlmsghdr *nlh; + struct ifinfomsg *iim; + + nlh = ofpbuf_at(buf, 0, NLMSG_HDRLEN); + iim = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *iim); + if (!iim) { + VLOG_WARN_RL(&rl, "received bad rtnl message (no ifinfomsg)"); + ofpbuf_delete(buf); + return; + } + + if (!nl_policy_parse(buf, NLMSG_HDRLEN + sizeof(struct ifinfomsg), + rtnlgrp_link_policy, + attrs, ARRAY_SIZE(rtnlgrp_link_policy))) { + VLOG_WARN_RL(&rl,"received bad rtnl message (policy)"); + ofpbuf_delete(buf); + return; + } + if (nlh->nlmsg_type == RTM_DELLINK && attrs[IFLA_MASTER]) { + const char *port_name = nl_attr_get_string(attrs[IFLA_IFNAME]); + char br_name[IFNAMSIZ]; + uint32_t br_idx = nl_attr_get_u32(attrs[IFLA_MASTER]); + struct svec ports; + + if (!if_indextoname(br_idx, br_name)) { + ofpbuf_delete(buf); + return; + } + + if (cfg_lock(NULL, lock_timeout)) { + /* Couldn't lock config file. */ + /* xxx this should try again and print error msg. */ + ofpbuf_delete(buf); + return; + } + + svec_init(&ports); + cfg_get_all_keys(&ports, "bridge.%s.port", br_name); + svec_sort(&ports); + if (svec_contains(&ports, port_name)) { + del_port(br_name, port_name); + rewrite_and_reload_config(); + } + cfg_unlock(); + } + ofpbuf_delete(buf); + } +} + +int +main(int argc, char *argv[]) +{ + struct unixctl_server *unixctl; + int retval; + + set_program_name(argv[0]); + register_fault_handlers(); + time_init(); + vlog_init(); + parse_options(argc, argv); + signal(SIGPIPE, SIG_IGN); + process_init(); + + die_if_already_running(); + daemonize(); + + retval = unixctl_server_create(NULL, &unixctl); + if (retval) { + ovs_fatal(retval, "could not listen for vlog connections"); + } + + if (brc_open(&brc_sock)) { + ovs_fatal(0, "could not open brcompat socket. Check " + "\"brcompat\" kernel module."); + } + + if (prune_timeout) { + if (nl_sock_create(NETLINK_ROUTE, RTNLGRP_LINK, 0, 0, &rtnl_sock)) { + ovs_fatal(0, "could not create rtnetlink socket"); + } + } + + cfg_read(); + + for (;;) { + unixctl_server_run(unixctl); + brc_recv_update(); + + /* If 'prune_timeout' is non-zero, we actively prune from the + * config file any 'bridge.<br_name>.port' entries that are no + * longer valid. We use two methods: + * + * 1) The kernel explicitly notifies us of removed ports + * through the RTNL messages. + * + * 2) We periodically check all ports associated with bridges + * to see if they no longer exist. + */ + if (prune_timeout) { + rtnl_recv_update(); + prune_ports(); + + nl_sock_wait(rtnl_sock, POLLIN); + poll_timer_wait(prune_timeout); + } + + nl_sock_wait(brc_sock, POLLIN); + unixctl_server_wait(unixctl); + poll_block(); + } + + return 0; +} + +static void +parse_options(int argc, char *argv[]) +{ + enum { + OPT_LOCK_TIMEOUT = UCHAR_MAX + 1, + OPT_PRUNE_TIMEOUT, + OPT_RELOAD_COMMAND, + VLOG_OPTION_ENUMS, + LEAK_CHECKER_OPTION_ENUMS + }; + static struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + {"lock-timeout", required_argument, 0, OPT_LOCK_TIMEOUT}, + {"prune-timeout", required_argument, 0, OPT_PRUNE_TIMEOUT}, + {"reload-command", required_argument, 0, OPT_RELOAD_COMMAND}, + DAEMON_LONG_OPTIONS, + VLOG_LONG_OPTIONS, + LEAK_CHECKER_LONG_OPTIONS, + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + int error; + + reload_command = xasprintf("%s/ovs-appctl -t " + "%s/ovs-vswitchd.`cat %s/ovs-vswitchd.pid`.ctl " + "-e vswitchd/reload 2>&1 " + "| /usr/bin/logger -t brcompatd-reload", + ovs_bindir, ovs_rundir, ovs_rundir); + for (;;) { + int c; + + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case 'H': + case 'h': + usage(); + + case 'V': + OVS_PRINT_VERSION(0, 0); + exit(EXIT_SUCCESS); + + case OPT_LOCK_TIMEOUT: + lock_timeout = atoi(optarg); + break; + + case OPT_PRUNE_TIMEOUT: + prune_timeout = atoi(optarg) * 1000; + break; + + case OPT_RELOAD_COMMAND: + reload_command = optarg; + break; + + VLOG_OPTION_HANDLERS + DAEMON_OPTION_HANDLERS + LEAK_CHECKER_OPTION_HANDLERS + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); + + argc -= optind; + argv += optind; + + if (argc != 1) { + ovs_fatal(0, "exactly one non-option argument required; " + "use --help for usage"); + } + + config_file = argv[0]; + error = cfg_set_file(config_file); + if (error) { + ovs_fatal(error, "failed to add configuration file \"%s\"", + config_file); + } +} + +static void +usage(void) +{ + printf("%s: bridge compatibility front-end for ovs-vswitchd\n" + "usage: %s [OPTIONS] CONFIG\n" + "CONFIG is the configuration file used by ovs-vswitchd.\n", + program_name, program_name); + printf("\nConfiguration options:\n" + " --reload-command=COMMAND shell command to reload ovs-vswitchd\n" + " --prune-timeout=SECS wait at most SECS before pruning ports\n" + " --lock-timeout=MSECS wait at most MSECS for CONFIG to unlock\n" + ); + daemon_usage(); + vlog_usage(); + printf("\nOther options:\n" + " -h, --help display this help message\n" + " -V, --version display version information\n"); + leak_checker_usage(); + printf("\nThe default reload command is:\n%s\n", reload_command); + exit(EXIT_SUCCESS); +} diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in new file mode 100644 index 00000000..28e55ba3 --- /dev/null +++ b/vswitchd/ovs-vswitchd.8.in @@ -0,0 +1,87 @@ +.TH ovs\-vswitchd 8 "March 2009" "Open vSwitch" "OpenVSwitch Manual" +.ds PN ovs\-vswitchd +. +.SH NAME +ovs\-vswitchd \- virtual switch daemon +. +.SH SYNOPSIS +.B ovs\-vswitchd +\fIconfig\fR +. +.SH DESCRIPTION +A daemon that manages and controls any number of virtual switches on +the local machine. +.PP +The mandatory \fIconfig\fR argument specifies a configuration file. +For a description of \fBovs\-vswitchd\fR configuration syntax, see +\fBovs\-vswitchd.conf\fR(5). +.PP +At startup or upon receipt of a \fBSIGHUP\fR signal, \fBovs\-vswitchd\fR +reads the configuration file. It sets up Open vSwitch datapaths and then +operates switching across each bridge described in its configuration +files. If a logfile was specified on the command line it will also +be opened or reopened. +.PP +\fBovs\-vswitchd\fR virtual switches may be configured with any of the +following features: +. +.IP \(bu +L2 switching with MAC learning. +. +.IP \(bu +NIC bonding with automatic fail-over and source MAC-based TX load +balancing ("SLB"). +. +.IP \(bu +802.1Q VLAN support. +. +.IP \(bu +Port mirroring, with optional VLAN tagging. +. +.IP \(bu +NetFlow v5 flow logging. +. +.IP \(bu +Connectivity to an external OpenFlow controller, such as NOX. +. +.PP +Only a single instance of \fBovs\-vswitchd\fR is intended to run at a time. +A single \fBovs\-vswitchd\fR can manage any number of virtual switches, up +to the maximum number of supported Open vSwitch datapaths. +.PP +\fBovs\-vswitchd\fR does all the necessary management of OpenVSwitch datapaths +itself. Thus, external tools, such \fBovs\-dpctl\fR(8), are not needed for +managing datapaths in conjunction with \fBovs\-vswitchd\fR, and their use +to modify datapaths when \fBovs\-vswitchd\fR is running can interfere with +its operation. (\fBovs\-dpctl\fR may still be useful for diagnostics.) +.PP +An Open vSwitch datapath kernel module must be loaded for \fBovs\-vswitchd\fR +to be useful. Please refer to the \fBINSTALL\fR file included in the +Open vSwitch distribution for instructions on how to build and load +the Open vSwitch kernel module. +.PP +.SH OPTIONS +.IP "\fB--fake-proc-net\fR" +Causes \fBovs\-vswitchd\fR to simulate some files in \fB/proc/net/vlan\fR +and \fB/proc/net/bonding\fR that some legacy software expects to +exist. This option should only be used if such legacy software is +actually in use. It requires the \fBbrcompat_mod.ko\fR kernel module +to be loaded. +. +.so lib/daemon.man +.so lib/vlog.man +.so lib/common.man +.so lib/leak-checker.man +. +.SH "BUGS" +. +Only Open vSwitch kernel-based datapaths are currently supported. In the +future, this restriction may be lifted. +.PP +Only Linux 2.6.\fIx\fR is currently supported. +. +.SH "SEE ALSO" +.BR ovs\-appctl (8), +.BR ovs\-vswitchd.conf (5), +.BR ovs\-brcompatd (8), +\fBINSTALL\fR in the Open vSwitch distribution. diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c new file mode 100644 index 00000000..9528ec5f --- /dev/null +++ b/vswitchd/ovs-vswitchd.c @@ -0,0 +1,255 @@ +/* Copyright (c) 2008, 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#include <config.h> + +#include <assert.h> +#include <errno.h> +#include <getopt.h> +#include <limits.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> + +#include "bridge.h" +#include "cfg.h" +#include "command-line.h" +#include "compiler.h" +#include "daemon.h" +#include "fault.h" +#include "leak-checker.h" +#include "mgmt.h" +#include "ovs-vswitchd.h" +#include "poll-loop.h" +#include "port.h" +#include "proc-net-compat.h" +#include "process.h" +#include "signals.h" +#include "svec.h" +#include "timeval.h" +#include "unixctl.h" +#include "util.h" +#include "vconn-ssl.h" +#include "vconn.h" + +#include "vlog.h" +#define THIS_MODULE VLM_vswitchd + +static void parse_options(int argc, char *argv[]); +static void usage(void) NO_RETURN; +static void reload(struct unixctl_conn *, const char *args); + +static bool need_reconfigure; +static struct unixctl_conn **conns; +static size_t n_conns; + +int +main(int argc, char *argv[]) +{ + struct unixctl_server *unixctl; + struct signal *sighup; + int retval; + + set_program_name(argv[0]); + register_fault_handlers(); + time_init(); + vlog_init(); + parse_options(argc, argv); + signal(SIGPIPE, SIG_IGN); + sighup = signal_register(SIGHUP); + process_init(); + + die_if_already_running(); + daemonize(); + + retval = unixctl_server_create(NULL, &unixctl); + if (retval) { + ovs_fatal(retval, "could not listen for control connections"); + } + unixctl_command_register("vswitchd/reload", reload); + + cfg_read(); + mgmt_init(); + bridge_init(); + port_init(); + mgmt_reconfigure(); + + need_reconfigure = false; + for (;;) { + if (need_reconfigure || signal_poll(sighup)) { + need_reconfigure = false; + vlog_reopen_log_file(); + reconfigure(); + } + mgmt_run(); + if (bridge_run()) { + need_reconfigure = true; + } + unixctl_server_run(unixctl); + + if (need_reconfigure) { + poll_immediate_wake(); + } + signal_wait(sighup); + mgmt_wait(); + bridge_wait(); + unixctl_server_wait(unixctl); + poll_block(); + } + + return 0; +} + +static void +reload(struct unixctl_conn *conn, const char *args UNUSED) +{ + need_reconfigure = true; + conns = xrealloc(conns, sizeof *conns * (n_conns + 1)); + conns[n_conns++] = conn; +} + +void +reconfigure(void) +{ + size_t i; + + cfg_read(); + bridge_reconfigure(); + mgmt_reconfigure(); + port_reconfigure(); + + for (i = 0; i < n_conns; i++) { + unixctl_command_reply(conns[i], 202, NULL); + } + free(conns); + conns = NULL; + n_conns = 0; +} + +static void +parse_options(int argc, char *argv[]) +{ + enum { + OPT_PEER_CA_CERT = UCHAR_MAX + 1, + OPT_FAKE_PROC_NET, + VLOG_OPTION_ENUMS, + LEAK_CHECKER_OPTION_ENUMS + }; + static struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + {"fake-proc-net", no_argument, 0, OPT_FAKE_PROC_NET}, + DAEMON_LONG_OPTIONS, + VLOG_LONG_OPTIONS, + LEAK_CHECKER_LONG_OPTIONS, +#ifdef HAVE_OPENSSL + VCONN_SSL_LONG_OPTIONS + {"peer-ca-cert", required_argument, 0, OPT_PEER_CA_CERT}, +#endif + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + const char *config_file; + int error; + + for (;;) { + int c; + + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case 'H': + case 'h': + usage(); + + case 'V': + OVS_PRINT_VERSION(OFP_VERSION, OFP_VERSION); + exit(EXIT_SUCCESS); + + case OPT_FAKE_PROC_NET: + error = proc_net_compat_init(); + if (error) { + ovs_fatal(error, "failed to initialize /proc/net " + "compatibility"); + } + break; + + VLOG_OPTION_HANDLERS + DAEMON_OPTION_HANDLERS + VCONN_SSL_OPTION_HANDLERS + LEAK_CHECKER_OPTION_HANDLERS + +#ifdef HAVE_OPENSSL + case OPT_PEER_CA_CERT: + vconn_ssl_set_peer_ca_cert_file(optarg); + break; +#endif + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); + + argc -= optind; + argv += optind; + + if (argc != 1) { + ovs_fatal(0, "config file is only non-option argument; " + "use --help for usage"); + } + + config_file = argv[0]; + error = cfg_set_file(config_file); + if (error) { + ovs_fatal(error, "failed to add configuration file \"%s\"", + config_file); + } +} + +static void +usage(void) +{ + printf("%s: virtual switch daemon\n" + "usage: %s [OPTIONS] CONFIG\n" + "CONFIG is a configuration file in ovs-vswitchd.conf(5) format.\n", + program_name, program_name); + daemon_usage(); + vlog_usage(); + printf("\nLegacy compatibility options:\n" + " --fake-proc-net simulate some files in /proc/net\n" + "\nOther options:\n" + " -h, --help display this help message\n" + " -V, --version display version information\n"); + leak_checker_usage(); + exit(EXIT_SUCCESS); +} diff --git a/vswitchd/ovs-vswitchd.conf.5.in b/vswitchd/ovs-vswitchd.conf.5.in new file mode 100644 index 00000000..89872184 --- /dev/null +++ b/vswitchd/ovs-vswitchd.conf.5.in @@ -0,0 +1,642 @@ +.\" -*- nroff -*- +.de TQ +. br +. ns +. TP "\\$1" +.. +.de IQ +. br +. ns +. IP "\\$1" +.. +.de ST +. PP +. RS -0.15in +. I "\\$1" +. RE +. PP +.. +.TH ovs\-vswitchd.conf 5 "April 2009" "Open vSwitch" "OpenVSwitch Manual" +. +.SH NAME +ovs\-vswitchd.conf \- configuration file for \fBovs\-vswitchd\fR +. +.SH DESCRIPTION +This manual page describes the syntax for the configuration file used +by \fBovs\-vswitchd\fR(8), the virtual switch daemon. +.PP +The configuration file is based on key-value pairs, which are given +one per line in the form \fIkey\fB=\fIvalue\fR. Each \fIkey\fR +consists of one or more parts separated by dots, +e.g. \fIpart1\fB.\fIpart2\fB.\fIpart3\fR. Each \fIpart\fR may consist +only of the English letters, digits, and the special characters +\fB_-@$:+\fR. White space within \fIvalue\fR and at the beginning of a +line is significant, but is otherwise ignored. +.PP +If a single key is specified more than once, that key has multiple +values, one value for each time the key is specified. The ordering of +key-value pairs, and the ordering of multiple values for a single key, +within a configuration file is not significant. +.PP +Blank lines, lines that consist only of white space, and lines that +begin with \fB#\fR (optionally preceded by white space) are ignored. +Keep in mind that programs that modify the configuration file, such as +\fBovs\-brcompatd\fR and \fBovs-cfg-mod\fR, may alter the order of +elements and +strip comments and blank lines. +.PP +The following subsections describe how key-value pairs are used to +configure \fBovs\-vswitchd\fR. +.SS "Bridge Configuration" +A bridge (switch) with a given \fIname\fR is configured by specifying +the names of its network devices as values for key +\fBbridge.\fIname\fB.port\fR. (The specified \fIname\fR may not begin +with \fBdp\fR or \fBnl:\fR followed by a digit.) +.PP +The names given on \fBbridge.\fIname\fB.port\fR must be the names of +existing network devices, except for ``internal ports.'' An internal +port is a simulated network device that receives traffic only +through the virtual switch and switches any traffic sent it through +virtual switch. An internal port may configured with an IP address, +etc. using the usual system tools (e.g. \fBifconfig\fR, \fBip\fR). To +designate network device \fInetdev\fR as an internal port, add +\fBiface.\fInetdev\fB.internal=true\fR to the configuration file. +\fBovs\-vswitchd\fR will honor this configuration setting by automatically +creating the named internal port. +.PP +A bridge with a given \fIname\fR always has an internal port with the +same \fIname\fR, called the ``local port.'' This network device may +be included +in the bridge, by specifying it as one of the values for key +\fBbridge.\fIname\fB.port\fR, or it may be omitted. If it is +included, then its MAC address is by default the lowest-numbered MAC +address among the other bridge ports, ignoring other internal ports +and bridge ports that are +used as port mirroring destinations (see \fBPort Mirroring\fR, below). To +use a specific MAC address instead, set \fBbridge.\fIname\fB.mac\fR to +a MAC address in the format +\fIxx\fB:\fIxx\fB:\fIxx\fB:\fIxx\fB:\fIxx\fB:\fIxx\fR, where each +\fIx\fR is a hex digit. If no valid MAC address can be determined +either of these ways, then a MAC address is randomly generated. +.PP +The following syntax defines a bridge named \fBmybr\fR, configured +with network devices \fBeth0\fR, \fBeth1\fR, and \fBeth2\fR: +.RS +.nf + +bridge.mybr.port=eth0 +bridge.mybr.port=eth1 +bridge.mybr.port=eth2 + +.fi +.RE +.SS "802.1Q VLAN support" +A bridge port may be configured either as a trunk port or as belonging +to a single, untagged VLAN. These two options are mutually exclusive, +and a port must be configured in one way or the other. +.ST "Trunk Ports" +By default, bridge ports are trunk ports that carry all VLANs. To +limit the VLANs that a trunk port carries, define +\fBvlan.\fIport\fB.trunks\fR to one or more integers between 0 and +4095 designating VLANs. Only frames that have an 802.1Q header with +one of the listed VLANs are accepted on a trunk port. If 0 is +included in the list, then frames without an 802.1Q header are also +accepted. Other frames are discarded. +.PP +The following syntax makes network device \fBeth0\fR a trunk port that +carries VLANs 1, 2, and 3: +.PP +.RS +.nf + +vlan.eth0.trunks=1 +vlan.eth0.trunks=2 +vlan.eth0.trunks=3 + +.fi +.RE +.ST "Untagged VLAN Ports" +A bridge port may be configured with an implicit, untagged VLAN. +Define key +\fBvlan.\fIport\fB.tag\fR to an integer value \fIvid\fR between 0 and +4095, inclusive, to designate the named \fIport\fR as a member +of 802.1Q VLAN \fIvid\fR. When \fIport\fR is assigned a VLAN tag this +way, frames arriving on trunk ports will be forwarded to \fIport\fR +only if they are tagged with VLAN \fIvid\fR, and frames arriving on +other VLAN ports will be forwarded to \fIport\fR only if their +\fIvid\fR values are equal. Frames forwarded to \fIport\fR will not +have an 802.1Q header. +.PP +When \fIvid\fR is 0, frames arriving on trunk ports without an 802.1Q +VLAN header will also be forwarded to \fIport\fR. +.PP +When a frame with a 802.1Q header that indicates a nonzero VLAN is +received on an implicit VLAN port, it is discarded. +.PP +The following syntax makes network device \fBeth0\fR a member of VLAN +101: +.PP +.RS +.nf + +vlan.eth0.tag=101 + +.fi +.RE +.SS "Network Device Bonding" +Bonding allows multiple ``slave'' network devices to be treated as if +they were a single virtual ``bonded'' network device. It is useful for +load balancing and fail-over. +.PP +\fBovs\-vswitchd\fR supports ``source load balancing'' (SLB) bonding, which +assigns flows to slaves based on source MAC address, with periodic +rebalancing as traffic patterns change. This form of bonding does not +require 802.3ad or other special support from the upstream switch to +which the slave devices are connected. +.PP +To configure bonding, create a virtual bonding device by specifying +the slave network device names as values for +\fBbonding.\fIname\fB.slave\fR, then specify \fIname\fR as a bridge +port. The chosen \fIname\fR should not be the name of any real +network device on the host system. +.PP +By default, bonding interfaces are enabled or disabled immediately +when a carrier is detected or dropped on the underlying network +device. To insert a delay when carrier comes up or goes down before +enabling or disabling an interface, set the value of +\fBbonding.\fIname\fB.updelay\fR or +\fBbonding.\fIname\fB.downdelay\fR, respectively, to a positive +integer, interpreted in milliseconds. +.PP +The following syntax bonds \fBeth0\fR and \fBeth1\fR into a bonding +device named \fBbond0\fR, which is added to bridge \fBmybr\fR along +with physical network devices \fBeth2\fR and \fBeth3\fR: +.PP +.RS +.nf + +bridge.mybr.port=bond0 +bridge.mybr.port=eth2 +bridge.mybr.port=eth3 + +bonding.bond0.slave=eth0 +bonding.bond0.slave=eth1 + +.fi +.RE +.SS "Port Mirroring (SPAN and RSPAN)" +\fBovs\-vswitchd\fR may be configured to send selected frames to special +``mirrored'' ports, in addition to their normal destinations. Mirroring +traffic may also be referred to as SPAN or RSPAN, depending on the +mechanism used for delivery. +.PP +Up to 32 instances of port mirroring may be configured on a given +bridge. Each must be given a name that is unique within the bridge. +The keys associated with port mirroring instance \fIpmname\fR for +bridge \fIbrname\fR begin with \fBmirror.\fIbrname\fB.\fIpmname\fR. +.PP +The selection of the frames to mirror and the form in which they +should be output is configured separately for each port mirroring +instances, through a subsection of +\fBmirror.\fIbrname\fB.\fIpmname\fR, named \fBselect\fR, and +\fBoutput\fR, respectively. +.ST "Selecting Frames to Mirror" +The values for the following keys, if specified, limit the frames that +are chosen for mirroring. If none of these keys is specified, then +all frames received by the bridge are mirrored. If more than one of +these keys is specified, then a frame must meet all specified criteria +to be mirrored. +.TP +\fBmirror.\fIbrname\fB.\fIpmname\fB.select.src-port=\fIport\fR +.TQ +\fBmirror.\fIbrname\fB.\fIpmname\fB.select.dst-port=\fIport\fR +.TQ +\fBmirror.\fIbrname\fB.\fIpmname\fB.select.port=\fIport\fR +Frame received on \fIport\fR, output to \fIport\fR, or either received +on or output to \fIport\fR, respectively. \fIport\fR must be part of +the bridge \fIbrname\fR; that is, it must be listed on +\fBbridge.\fIbrname\fB.port\fR. +.TP +\fBmirror.\fIbrname\fB.\fIpmname\fB.select.vlan=\fIvid\fR +. +\fIvid\fR must be an integer between 0 and 4095, inclusive. A nonzero +\fIvid\fR selects frames that belong to VLAN \fIvid\fR, that is, +frames that arrived on a trunk port tagged with VLAN \fIvid\fR or on a +port that is configured as part of VLAN \fIvid\fR (see \fB802.1Q VLAN +tagging\fR, above). A \fIvid\fR of zero selects frames that do not +belong to a VLAN, that is, frames that arrived on a trunk port without +a VLAN tag or tagged with VLAN 0. +.ST "Mirror Output" +The values of the following keys determine how frames selected for +mirroring are output. Only one of the keys may be specified. +.TP +\fBmirror.\fIbrname\fB.\fIpmname\fB.output.port=\fIport\fR +. +Causes the selected frames to be sent out \fIport\fR, which must be +part of the bridge \fIbrname\fR; that is, it must be listed on +\fBbridge.\fIbrname\fB.port\fR. +.IP +Specifying a \fIport\fR in this way reserves that port exclusively for +mirroring. No frames other than those selected for mirroring will be +forwarded to \fIport\fR, and any frames received on \fIport\fR will be +discarded. This type of mirroring may be referred to as SPAN. +.TP +\fBmirror.\fIbrname\fB.\fIpmname\fB.output.vlan=\fIvid\fR +. +Causes the selected frames to be sent on the VLAN numbered \fIvid\fR, +which must be an integer between 0 and 4095, inclusive. The frames +will be sent out all ports that trunk VLAN \fIvid\fR, as well as any +ports with implicit VLAN \fIvid\fR. When a mirrored frame is sent out +a trunk port, the frame's VLAN tag will be set to \fIvid\fR, replacing +any existing tag; when it is sent out an implicit VLAN port, the frame +will not be tagged. This type of mirroring may be referred to as +RSPAN. +.ST "Example" +The following \fBovs\-vswitchd\fR configuration copies all frames received +on \fBeth1\fR or \fBeth2\fR to \fBeth3\fR. +.PP +.RS +.nf + +bridge.mybr.port=eth1 +bridge.mybr.port=eth2 +bridge.mybr.port=eth3 + +mirror.mybr.a.select.src-port=eth1 +mirror.mybr.a.select.src-port=eth2 +mirror.mybr.a.output.port=eth3 + +.fi +.RE +.SS "Port Rate-Limiting" +Traffic policing and shaping are configured on physical ports. Policing +defines a hard limit at which traffic that exceeds the specified rate is +dropped. Shaping uses queues to delay packets so that egress traffic +leaves at the specified rate. + +.ST "Ingress Policing" +The rate at which traffic is allowed to enter through a particular +physical port can be configured with ingress policing. The rate is +specified in kilobits (1000 bits) per second with a maximum burst size +specified in kilobits (1000 bits). The burst size should be at least +the size of the port's MTU. + +A port may be configured to enforce ingress policing by defining the +key \fBport.\fIname\fB.ingress.policing-rate\fR with an integer +indicating the rate. The port \fIname\fR will only allow traffic to be +received at the rate specified in kilobits per second. If the rate is zero +or the key is not defined, then ingress policing is disabled. + +If ingress policing is enabled, then the burst rate may be set by defining +the key \fBport.\fIname\fB.ingress.policing-burst\fR with an integer +indicating the burst rate in kilobits. If the key is not supplied or is +zero, then the default burst is 10 kilobits. + +.PP +The following syntax limits port \fBeth1\fR to receiving traffic at +\fB512\fR kilobits per second with a burst of \fB20\fR kilobits: +.PP +.RS +.nf + +port.eth1.ingress.policing-rate=512 +port.eth1.ingress.policing-burst=20 + +.fi +.SS "NetFlow v5 Flow Logging" +NetFlow is a protocol that exports a number of details about terminating +IP flows, such as the principals involved and duration. A bridge may be +configured to send NetFlow v5 records to NetFlow collectors when flows +end. To enable, define the key \fBnetflow.\fIbridge\fB.host\fR for each +collector in the form \fIhost\fB:\fIport\fR. Records from \fIbridge\fR +will be sent to each \fIhost\fR on UDP \fIport\fR. + +The NetFlow messages will use the datapath index for the engine type and id. +This can be overridden with the \fBnetflow.\fIbridge\fB.engine-type\fR and +\fBnetflow.\fIbridge\fB.engine-id\fR, respectively. Each takes a value +between 0 and 255, inclusive. + +Many NetFlow collectors do not expect multiple virtual switches to be +sending messages from the same host, and they do not store the engine +information which could be used to disambiguate the traffic. To prevent +flows from multiple switches appearing as if they came on the interface, +add \fBnetflow.\fIbridge\fB.add-id-to-iface=true\fR to the configuration +file. This will place the least significant 7 bits of the engine id +into the most significant bits of the ingress and egress interface fields +of flow records. By default, this behavior is disabled. + +The following syntax sends NetFlow records for \fBmybr\fR to the NetFlow +collector \fBnflow.example.com\fR on UDP port \fB9995\fR: +.PP +.RS +.nf + +netflow.mybr.host=nflow.example.com:9995 + +.fi +.RE +.SS "Remote Management" +A \fBovs\-vswitchd\fR instance may be remotely managed by a controller that +supports the OpenFlow Management Protocol, such as NOX. This +functionality is enabled by setting the key \fBmgmt.controller\fR to one +of the following values: +. +.TP +\fBssl:\fIhost\fR[\fB:\fIport\fR] +The specified SSL \fIport\fR (default: 6633) on the given remote +\fIhost\fR. SSL must be configured when this form is used (see \fBSSL +Configuration\fR, below). +. +.TP +\fBtcp:\fIhost\fR[\fB:\fIport\fR] +The specified TCP \fIport\fR (default: 6633) on the given remote +\fIhost\fR. +.PP +The maximum time between attempts to connect to the controller may be +specified in integral seconds with the \fBmgmt.max-backoff\fR key. The +default maximum backoff is 15 seconds, and the minimum value is 1 +second. + +An inactivity probe may be configured with the \fBmgmt.inactivity-probe\fR +key. If \fBovs\-vswitchd\fR does not communicate with the controller for the +specified number of seconds, it will send a probe. If a response is not +received for an additional amount of that time, \fBovs\-vswitchd\fR assumes +the connection has been broken and attempts to reconnect. The default +is 15 seconds, and the minimum value is 5 seconds. + +A management id may be specified with the \fBmgmt.id\fR key. It takes +an id in the form of exactly 12 hexadecimal digits. If one is not +specified, a random id is generated each time \fBovs\-vswitchd\fR is started. +.fi +.RE +.SS "OpenFlow Controller Connectivity" +\fBovs\-vswitchd\fR can perform all configured bridging and switching +locally, or it can be configured to connect a given bridge to an +external OpenFlow controller, such as NOX. Its behavior depends on +the \fBbridge.\fIname\fB.controller\fR setting: +. +.TP +\fI\[la]unset\[ra]\fR +When the key is not set, the behavior depends on whether remote +management is configured. If management is configured, then the switch +will connect to the controller specified on \fBmgmt.controller\fR. If +management is not configured, the switch will perform all configured +bridging and switching locally. +. +.TP +\fI\[la]empty\[ra]\fR +Setting an empty string value disables controller connectivity. The +switch will perform all configured bridging and switching locally. +. +.TP +\fBdiscover\fR +Use controller discovery to find the local OpenFlow controller. +Refer to \fBsecchan\fR(8) for information on how to configure a DHCP +server to support controller discovery. The following additional +options control the discovery process: +. +.RS +.TP +\fBbridge.\fIname\fB.controller.accept-regex=\fIregex\fR +A POSIX extended regular expression against which the discovered +controller location is validated. Only controllers whose names match +the regular expression will be accepted. +.IP +The default regular expression is \fBssl:.*\fR, meaning that only SSL +controller connections will be accepted, when SSL is configured (see +\fBSSL Configuration\fR), and \fB.*\fR otherwise, meaning that any +controller will be accepted. +.IP +The regular expression is implicitly anchored at the beginning of the +controller location string, as if it begins with \fB^\fR. +.TP +\fBbridge.\fIname\fB.controller.update-resolv.conf=\fBtrue\fR|\fBfalse\fR +By default, or if this is set to \fBtrue\fR, \fBovs\-vswitchd\fR overwrites +the system's \fB/etc/resolv.conf\fR with domain information and DNS +servers obtained via DHCP. If this setting is \fBfalse\fR, +\fBovs\-vswitchd\fR will not modify \fB/etc/resolv.conf\fR. +.IP +\fBovs\-vswitchd\fR will only modify \fBresolv.conf\fR if the DHCP response +that it receives specifies one or more DNS servers. +.RE +. +.TP +\fBssl:\fIhost\fR[\fB:\fIport\fR] +The specified SSL \fIport\fR (default: 6633) on the given remote +\fIhost\fR. SSL must be configured when this form is used (see \fBSSL +Configuration\fR, below). +. +.TP +\fBtcp:\fIhost\fR[\fB:\fIport\fR] +The specified TCP \fIport\fR (default: 6633) on the given remote +\fIhost\fR. +. +.TP +\fBunix:\fIfile\fR +The Unix domain server socket named \fIfile\fR. +.PP +The datapath ID used by the bridge to identify itself to the remote +controller may be specified as \fBbridge.\fIname\fB.datapath-id\fR, +in the form of exactly 12 hexadecimal digits. If the datapath ID +is not specified, then it defaults to the bridge's MAC address (see +\fBBridge Configuration\fR, above, for information on how the bridge's +MAC address is chosen). +.ST "Local Port Network Configuration" +When an external controller is configured, but controller discovery is +not in use, the following additional settings are honored: +.TP +\fBbridge.\fIname\fB.controller.in-band=\fBtrue\fR|\fBfalse\fR +By default, or if this is set to \fBtrue\fR, \fBovs\-vswitchd\fR connects +to the controller in-band. If this is set to \fBfalse\fR, +\fBovs\-vswitchd\fR connects to the controller out-of-band. Refer to +\fBsecchan\fR(8) for a description of in-band and out-of-band control. +.IP "\fBbridge.\fIname\fB.controller.ip=\fIip\fR" +If specified, the IP address to configure on the bridge's local port. +.IP "\fBbridge.\fIname\fB.controller.netmask=\fInetmask\fR" +When an IP is specified, the corresponding netmask. The default is +255.255.255.0 for a Class C IP address, 255.255.0.0 for Class B, and +255.0.0.0 for Class A. +.IP "\fBbridge.\fIname\fB.controller.gateway=\fIip\fR" +When an IP is specified, the corresponding IP gateway. There is no +default gateway. +.ST "Controller Failure Settings" +The following additional settings take effect when any remote +controller is configured: +.IP "\fBbridge.\fIname\fB.controller.inactivity-probe=\fIsecs\fR" +This optional setting may be set to \fIsecs\fR, a number of seconds. +The minimum value of \fIsecs\fR is 5 seconds. The default is taken +from \fBmgmt.inactivity-probe\fR (see above). +.IP +When the virtual switch is connected to the controller, it waits for a +message to be received from the controller for \fIsecs\fR seconds +before it sends a inactivity probe to the controller. After sending +the inactivity probe, if no response is received for an additional +\fIsecs\fR seconds, the secure channel assumes that the connection has +been broken and attempts to reconnect. +.IP +Changing the inactivity probe interval also changes the interval +before entering standalone mode (see below). +.IP "\fBbridge.\fIname\fB.controller.fail-mode=\fBstandalone\fR|\fBsecure\fR" +.IQ "\fBmgmt.fail-mode=standalone\fR|\fBsecure\fR" +When a controller is configured, it is, ordinarily, responsible for +setting up all flows on the virtual switch. Thus, if the connection to +the controller fails, no new network connections can be set up. If +the connection to the controller stays down long enough, no packets +can pass through the switch at all. +.IP +The first of these that is set takes effect. +If the value is \fBstandalone\fR, \fBovs\-vswitchd\fR will take over +responsibility for setting up +flows when no message has been received from the controller for three +times the inactivity probe interval (see above). In this mode, +\fBovs\-vswitchd\fR causes the datapath to act like an ordinary +MAC-learning switch. \fBovs\-vswitchd\fR will continue to retry connecting +to the controller in the background and, when the connection succeeds, +it discontinues its standalone behavior. +.IP +If this option is set to \fBsecure\fR, or if neither of these settings +is set, \fBovs\-vswitchd\fR will not set up flows on its own when the +controller connection fails. +.IP "\fBbridge.\fIname\fB.controller.max-backoff=\fIsecs\fR" +Sets the maximum time between attempts to connect to the controller to +\fIsecs\fR, which must be at least 1. The actual interval between +connection attempts starts at 1 second and doubles on each failing +attempt until it reaches the maximum. The default maximum backoff +time is taken from \fBmgmt.max-backoff\fR. +.ST "Controller Rate-Limiting" +These settings configure how the virtual switch applies a ``token +bucket'' to limit the rate at which packets in unknown flows are +forwarded to the OpenFlow controller for flow-setup processing. This +feature prevents a single bridge from overwhelming a controller. +.IP "\fBbridge.\fIname\fB.controller.rate-limit=\fIrate\fR" +.IQ "\fBmgmt.rate-limit=\fIrate\fR" +Limits the maximum rate at which packets will be forwarded to the +OpenFlow controller to \fIrate\fR packets per second. A rate specified +explicitly for \fIname\fR overrides a value configured using the +\fBmgmt.rate-limit\fR key. +.IP +If neither one of these settings is set, then the bridge does not +limit the rate at which packets are forwarded to the controller. +.IP "\fBbridge.\fIname\fB.controller.burst-limit=\fIburst\fR" +.IQ "\fBmgmt.burst-limit=\fIburst\fR" +Sets the maximum number of unused packet credits that the bridge will +allow to accumulate during the time in which no packets are being +forwarded to the OpenFlow controller to \fIburst\fR (measured in +packets). The default \fIburst\fR is one-quarter of the \fIrate\fR +specified in the rate-limit setting. +.IP +A burst specified explicitly for \fIname\fR overrides a value configured +using the \fBmgmt.burst-limit\fR key. This option takes effect only +when a rate-limit is specified. +.ST "Remote Command Execution Settings" +These settings configure the commands that remote OpenFlow connections +are allowed to invoke using (e.g.) \fBovs\-ofctl execute\fR. To be +permitted, a command name must be whitelisted and must not be +blacklisted. When the whitelist and blacklist permit a command name, +\fBovs\-vswitchd\fR looks for a program with the same name as the command +in the commands directory (see below). Other directories are not +searched. +.IP "\fBbridge.\fIname\fB.controller.commands.acl=\fIglob\fR" +Whitelists commands whose names match shell glob pattern \fIglob\fR, +allowing those commands to be invoked by the remote controller. +.IP +By default, no commands are whitelisted, so this setting is mandatory +if any remote command execution is to be allowed. +.IP "\fBbridge.\fIname\fB.controller.commands.acl=\fB!\fR\fIglob\fR" +Blacklists commands whose names match shell glob pattern \fIglob\fR, +prohibiting those commands from being invoked by the remote +controller. Command names that include characters other than upper- +and lower-case English letters, digits, and the underscore and hyphen +characters are blacklisted unconditionally. +.IP "\fBbridge.\fIname\fB.controller.commands.dir=\fIdirectory\fR" +Sets the directory searched for remote command execution to +\fIdirectory\fR. The default directory is +\fB@pkgdatadir@/commands\fR. +.SS "SSL Configuration" +When \fBovs\-vswitchd\fR is configured to connect over SSL for management or +for controller connectivity, the following settings are required: +.TP +\fBssl.private-key=\fIprivkey.pem\fR +Specifies a PEM file containing the private key used as the virtual +switch's identity for SSL connections to the controller. +.TP +\fBssl.certificate=\fIcert.pem\fR +Specifies a PEM file containing a certificate, signed by the +certificate authority (CA) used by the controller and manager, that +certifies the virtual switch's private key, identifying a trustworthy +switch. +.TP +\fBssl.ca-cert=\fIcacert.pem\fR +Specifies a PEM file containing the CA certificate used to verify that +the virtual switch is connected to a trustworthy controller. +.PP +These files are read only once, at \fBovs\-vswitchd\fR startup time. If +their contents change, \fBovs\-vswitchd\fR must be killed and restarted. +.PP +These SSL settings apply to all SSL connections made by the virtual +switch. +.ST "CA Certificate Bootstrap" +Ordinarily, all of the files named in the SSL configuration must exist +when \fBovs\-vswitchd\fR starts. However, if \fBssl.bootstrap-ca-cert\fR +is set to \fBtrue\fR, then \fBovs\-vswitchd\fR will attempt to obtain the +CA certificate from the controller on its first SSL connection and +save it to the named PEM file. If it is successful, it will +immediately drop the connection and reconnect, and from then on all +SSL connections must be authenticated by a certificate signed by the +CA certificate thus obtained. +.PP +\fBThis option exposes the SSL connection to a man-in-the-middle +attack obtaining the initial CA certificate\fR, but it may be useful +for bootstrapping. +.PP +This option is only useful if the controller sends its CA certificate +as part of the SSL certificate chain. The SSL protocol does not +require the controller to send the CA certificate, but +\fBcontroller\fR(8) can be configured to do so with the +\fB--peer-ca-cert\fR option. +.SS "OpenFlow Management Connections" +By default, each bridge \fIname\fR listens for OpenFlow management +connections on a Unix domain socket named +\fB@RUNDIR@/\fIname\fB.mgmt\fR. This socket can be used to perform +local OpenFlow monitoring and administration, e.g., \fBovs\-ofctl dump-flows +unix:@RUNDIR@/\fIname\fB.mgmt\fR to display the flows currently set up +in bridge \fIname\fR. +.PP +If \fBbridge.\fIname\fB.openflow.listeners\fR is set to one or more +values, \fBovs\-vswitchd\fR instead listens on the specified connection +methods. Acceptable connection methods include: +.RS +.IP "\fBpunix:\fIfile\fR" +Listens for connections on the Unix domain server socket named \fIfile\fR. +.IP "\fBpssl:\fR[\fIport\fR]" +Listens for SSL connections on \fIport\fR (default: 6633). SSL must +be configured when this form is used (see \fBSSL Configuration\fR, +above). +.IP "\fBptcp:\fR[\fIport\fR]" +Listens for TCP connections on \fIport\fR (default: 6633). +.RE +To entirely disable listening for management connections, set +\fBbridge.\fIname\fB.openflow.listeners\fR to the single value +\fBnone\fR. + +.SS "OpenFlow Controller Connection Snooping" +By default, each bridge \fIname\fR listens for OpenFlow controller +connection snooping connections on a Unix domain socket named +\fB@RUNDIR@/\fIname\fB.snoop\fR. A client that connects to this +socket, e.g., \fBovs\-ofctl monitor unix:@RUNDIR@/\fIname\fB.snoop\fR, will +receive a copy of every OpenFlow message sent by the switch to the +controller, or vice versa, on the primary OpenFlow controller +connection. +.PP +If \fBbridge.\fIname\fB.openflow.snoops\fR is set to one or more +values, \fBovs\-vswitchd\fR instead listens on the specified connection +methods. The acceptable connection methods are the same as for +OpenFlow management connections (see above). +.PP +To entirely disable controller connection snooping, set +\fBbridge.\fIname\fB.openflow.snoops\fR to the single value +\fBnone\fR. +.SH "SEE ALSO" +.BR ovs\-brcompatd (8), +.BR ovs\-cfg\-mod (8), +.BR ovs\-vswitchd (8) diff --git a/vswitchd/ovs-vswitchd.h b/vswitchd/ovs-vswitchd.h new file mode 100644 index 00000000..f292d682 --- /dev/null +++ b/vswitchd/ovs-vswitchd.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#ifndef VSWITCHD_H +#define VSWITCHD_H 1 + +void reconfigure(void); + +#endif /* ovs-vswitchd.h */ diff --git a/vswitchd/port.c b/vswitchd/port.c new file mode 100644 index 00000000..f6348f35 --- /dev/null +++ b/vswitchd/port.c @@ -0,0 +1,68 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include <config.h> + +#include "bridge.h" +#include "cfg.h" +#include "netdev.h" +#include "ovs-vswitchd.h" +#include "port.h" +#include "svec.h" + +#define THIS_MODULE VLM_port +#include "vlog.h" + +static int +set_ingress_policing(const char *port_name) +{ + int kbits_rate = cfg_get_int(0, "port.%s.ingress.policing-rate", + port_name); + int kbits_burst = cfg_get_int(0, "port.%s.ingress.policing-burst", + port_name); + + return netdev_nodev_set_policing(port_name, kbits_rate, kbits_burst); +} + +void +port_init(void) +{ + port_reconfigure(); +} + +void +port_reconfigure(void) +{ + struct svec ports; + int i; + + svec_init(&ports); + bridge_get_ifaces(&ports); + for (i=0; i<ports.n; i++) { + set_ingress_policing(ports.names[i]); + } +} diff --git a/vswitchd/port.h b/vswitchd/port.h new file mode 100644 index 00000000..55c2d7bc --- /dev/null +++ b/vswitchd/port.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#ifndef VSWITCHD_PORT_H +#define VSWITCHD_PORT_H 1 + +void port_init(void); +void port_reconfigure(void); + +#endif /* port.h */ diff --git a/vswitchd/proc-net-compat.c b/vswitchd/proc-net-compat.c new file mode 100644 index 00000000..3f5cf44a --- /dev/null +++ b/vswitchd/proc-net-compat.c @@ -0,0 +1,344 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + */ + +#include <config.h> +#include "proc-net-compat.h" +#include <assert.h> +#include <dirent.h> +#include <errno.h> +#include <inttypes.h> +#include <string.h> +#include "dynamic-string.h" +#include "hash.h" +#include "netlink-protocol.h" +#include "netlink.h" +#include "ofpbuf.h" +#include "openvswitch/brcompat-netlink.h" +#include "hmap.h" +#include "shash.h" +#include "svec.h" + +#define THIS_MODULE VLM_proc_net_compat +#include "vlog.h" + +/* Netlink socket to bridge compatibility kernel module. */ +static struct nl_sock *brc_sock; + +/* The Generic Netlink family number used for bridge compatibility. */ +static int brc_family = 0; + +/* Rate limiting for log messages. */ +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + +static void flush_dir(const char *dir); +static int set_proc_file(const char *dir, const char *file, const char *data); + +/* Initializes the /proc/net compatibility layer. Returns 0 if successful, + * otherwise a positive errno value. */ +int +proc_net_compat_init(void) +{ + if (!brc_sock) { + int retval = nl_lookup_genl_family(BRC_GENL_FAMILY_NAME, &brc_family); + if (retval) { + return retval; + } + + retval = nl_sock_create(NETLINK_GENERIC, 0, 0, 0, &brc_sock); + if (retval) { + return retval; + } + + flush_dir("/proc/net/vlan"); + flush_dir("/proc/net/bonding"); + } + return 0; +} + +static int +set_proc_file(const char *dir, const char *file, const char *data) +{ + struct ofpbuf request, *reply; + int retval; + + ofpbuf_init(&request, 0); + nl_msg_put_genlmsghdr(&request, brc_sock, 1024, brc_family, NLM_F_REQUEST, + BRC_GENL_C_SET_PROC, 1); + nl_msg_put_string(&request, BRC_GENL_A_PROC_DIR, dir); + nl_msg_put_string(&request, BRC_GENL_A_PROC_NAME, file); + if (data) { + nl_msg_put_string(&request, BRC_GENL_A_PROC_DATA, data); + } + + retval = nl_sock_transact(brc_sock, &request, &reply); + ofpbuf_uninit(&request); + ofpbuf_delete(reply); + if (retval) { + VLOG_WARN_RL(&rl, "failed to %s /proc/%s/%s (%s)", + data ? "update" : "remove", dir, file, strerror(retval)); + } + return retval; +} + +static void +flush_dir(const char *dir) +{ + const char *subdir; + struct dirent *de; + DIR *stream; + + assert(!memcmp(dir, "/proc/", 6)); + subdir = dir + 6; + + stream = opendir(dir); + if (!stream) { + if (errno != ENOENT) { + VLOG_WARN_RL(&rl, "%s: open failed (%s)", dir, strerror(errno)); + } + return; + } + + while ((de = readdir(stream)) != NULL) { + if (strcmp(de->d_name, ".") && strcmp(de->d_name, "..")) { + set_proc_file(subdir, de->d_name, NULL); + } + } + closedir(stream); +} + +/* If 'bond' is nonnull, creates a file in /proc/net/bonding for a bond with + * the given 'name' and the details in 'bond'. If 'bond' is null, deletes + * the /proc/net/bonding file with the given 'name'. + * + * This function has no effect unless proc_net_compat_init() has been + * called. */ +void +proc_net_compat_update_bond(const char *name, const struct compat_bond *bond) +{ + struct ds ds; + int i; + + if (!brc_sock) { + return; + } + + if (!bond) { + set_proc_file("net/bonding", name, NULL); + return; + } + + ds_init(&ds); + ds_put_format( + &ds, + "Ethernet Channel Bonding Driver: ovs-vswitchd " + VERSION BUILDNR" ("__DATE__" "__TIME__")\n" + "Bonding Mode: source load balancing\n" + "Primary Slave: None\n" + "Currently Active Slave: None\n" + "MII Status: %s\n" + "MII Polling Interval (ms): 100\n" + "Up Delay (ms): %d\n" + "Down Delay (ms): %d\n" + "\n" + "Source load balancing info:\n", + bond->up ? "up" : "down", bond->updelay, bond->downdelay); + for (i = 0; i < bond->n_slaves; i++) { + const struct compat_bond_slave *slave = &bond->slaves[i]; + ds_put_format( + &ds, + "\n" + "Slave Interface: %s\n" + "MII Status: %s\n" + "Link Failure Count: 0\n" + "Permanent HW addr: "ETH_ADDR_FMT"\n", + slave->name, slave->up ? "up" : "down", + ETH_ADDR_ARGS(slave->mac)); + } + set_proc_file("net/bonding", name, ds_cstr(&ds)); + ds_destroy(&ds); +} + +/* /proc/net/vlan compatibility. + * + * This is much more complex than I expected it to be. */ + +struct compat_vlan { + /* Hash key. */ + struct hmap_node trunk_node; /* Hash map node. */ + char *trunk_dev; /* Name of trunk network device. */ + int vid; /* VLAN number. */ + + /* Auxiliary data. */ + char *vlan_dev; /* sprintf("%s.%d", trunk_dev, vid); */ + struct svec tagged_devs; /* Name of tagged network device(s). */ +}; + +/* Current set of VLAN devices, indexed two different ways. */ +static struct hmap vlans_by_trunk = HMAP_INITIALIZER(&vlans_by_trunk); +static struct shash vlans_by_tagged = SHASH_INITIALIZER(&vlans_by_tagged); + +static bool remove_tagged_dev(struct shash_node *, const char *tagged_dev); +static void update_vlan_config(void); +static void set_vlan_proc_file(const struct compat_vlan *); +static uint32_t hash_vlan(const char *trunk_dev, uint32_t vid); + +/* Updates the /proc/net/vlan compatibility layer's idea of what trunk device + * and VLAN the given 'tagged_dev' is associated with. If 'tagged_dev' has an + * implicit VLAN tag, then 'trunk_dev' should be the name of a network device + * on the same bridge that trunks that VLAN, and 'vid' should be the VLAN tag + * number. If 'tagged_dev' does not have an implicit VLAN tag, then + * 'trunk_dev' should be NULL and 'vid' should be -1. + * + * This function has no effect unless proc_net_compat_init() has been + * called. */ +void +proc_net_compat_update_vlan(const char *tagged_dev, const char *trunk_dev, + int vid) +{ + struct compat_vlan *vlan; + struct shash_node *node; + + if (!brc_sock) { + return; + } + + /* Find the compat_vlan that we currently have for 'tagged_dev' (if + * any). */ + node = shash_find(&vlans_by_tagged, tagged_dev); + vlan = node ? node->data : NULL; + if (vid <= 0 || !trunk_dev) { + if (vlan) { + if (remove_tagged_dev(node, tagged_dev)) { + update_vlan_config(); + } + } + } else { + if (vlan) { + if (!strcmp(trunk_dev, vlan->trunk_dev) && vid == vlan->vid) { + /* No change. */ + return; + } else { + /* 'tagged_dev' is attached to the wrong compat_vlan. Start + * by removing it from that one. */ + remove_tagged_dev(node, tagged_dev); + node = NULL; + vlan = NULL; + } + } + + /* 'tagged_dev' is not attached to any compat_vlan. Find the + * compat_vlan corresponding to (trunk_dev,vid) to attach it to, or + * create a new compat_vlan if none exists for (trunk_dev,vid). */ + HMAP_FOR_EACH_WITH_HASH (vlan, struct compat_vlan, trunk_node, + hash_vlan(trunk_dev, vid), + &vlans_by_trunk) { + if (!strcmp(trunk_dev, vlan->trunk_dev) && vid == vlan->vid) { + break; + } + } + if (!vlan) { + /* Create a new compat_vlan for (trunk_dev,vid). */ + vlan = xcalloc(1, sizeof *vlan); + vlan->trunk_dev = xstrdup(trunk_dev); + vlan->vid = vid; + vlan->vlan_dev = xasprintf("%s.%d", trunk_dev, vid); + svec_init(&vlan->tagged_devs); + hmap_insert(&vlans_by_trunk, &vlan->trunk_node, + hash_vlan(trunk_dev, vid)); + set_vlan_proc_file(vlan); + } + + /* Attach 'tagged_dev' to 'vlan'. */ + svec_add(&vlan->tagged_devs, tagged_dev); + shash_add(&vlans_by_tagged, tagged_dev, vlan); + svec_sort(&vlan->tagged_devs); + update_vlan_config(); + } +} + +/* Remove 'tagged_dev' from the compat_vlan in 'node'. If that causes the + * compat_vlan to have no tagged_devs left, destroy the compat_vlan too. */ +static bool +remove_tagged_dev(struct shash_node *node, const char *tagged_dev) +{ + struct compat_vlan *vlan = node->data; + + svec_del(&vlan->tagged_devs, tagged_dev); + shash_delete(&vlans_by_tagged, node); + if (!vlan->tagged_devs.n) { + set_proc_file("net/vlan", vlan->vlan_dev, NULL); + + hmap_remove(&vlans_by_trunk, &vlan->trunk_node); + svec_destroy(&vlan->tagged_devs); + free(vlan->trunk_dev); + free(vlan->vlan_dev); + free(vlan); + return true; + } + return false; +} + +/* Returns a hash value for (trunk_dev,vid). */ +static uint32_t +hash_vlan(const char *trunk_dev, uint32_t vid) +{ + return hash_int(vid, hash_string(trunk_dev, 0)); +} + +/* Update /proc/net/vlan/<vlan_dev> for 'vlan'. */ +static void +set_vlan_proc_file(const struct compat_vlan *vlan) +{ + struct ds ds; + + ds_init(&ds); + ds_put_format( + &ds, + "%s VID: %d\t REORDER_HDR: 1 dev->priv_flags: 81\n" + " total frames received 0\n" + " total bytes received 0\n" + " Broadcast/Multicast Rcvd 0\n" + "\n" + " total frames transmitted 0\n" + " total bytes transmitted 0\n" + " total headroom inc 0\n" + " total encap on xmit 0\n" + "Device: %s\n" + "INGRESS priority mappings: 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0\n" + "EGRESSS priority Mappings: \n", + vlan->vlan_dev, vlan->vid, vlan->trunk_dev); + set_proc_file("net/vlan", vlan->vlan_dev, ds_cstr(&ds)); + ds_destroy(&ds); +} + +/* Update /proc/net/vlan/config. */ +static void +update_vlan_config(void) +{ + struct compat_vlan *vlan; + struct ds ds; + + ds_init(&ds); + ds_put_cstr(&ds, "VLAN Dev name | VLAN ID\n" + "Name-Type: VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD\n"); + HMAP_FOR_EACH (vlan, struct compat_vlan, trunk_node, &vlans_by_trunk) { + ds_put_format(&ds, "%-15s| %d | %s\n", + vlan->vlan_dev, vlan->vid, vlan->trunk_dev); + } + set_proc_file("net/vlan", "config", ds_cstr(&ds)); + ds_destroy(&ds); +} diff --git a/vswitchd/proc-net-compat.h b/vswitchd/proc-net-compat.h new file mode 100644 index 00000000..ce97176b --- /dev/null +++ b/vswitchd/proc-net-compat.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#ifndef VSWITCHD_PROC_NET_COMPAT_H +#define VSWITCHD_PROC_NET_COMPAT_H 1 + +#include "packets.h" + +struct compat_bond { + bool up; + int updelay; + int downdelay; + int n_slaves; + struct compat_bond_slave *slaves; +}; + +struct compat_bond_slave { + const char *name; + bool up; + uint8_t mac[ETH_ADDR_LEN]; +}; + +int proc_net_compat_init(void); +void proc_net_compat_update_bond(const char *name, const struct compat_bond *); +void proc_net_compat_update_vlan(const char *dev, const char *vlandev, + int vlan); + +#endif /* vswitchd/proc-net-compat.h */ diff --git a/vswitchd/xenserver.c b/vswitchd/xenserver.c new file mode 100644 index 00000000..7a8d255f --- /dev/null +++ b/vswitchd/xenserver.c @@ -0,0 +1,90 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#include <config.h> +#include "xenserver.h" +#include <ctype.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include "dynamic-string.h" +#include "process.h" + +#include "vlog.h" +#define THIS_MODULE VLM_xenserver + +static char * +read_host_uuid(void) +{ + static const char filename[] = "/etc/xensource-inventory"; + char line[128]; + FILE *file; + + file = fopen(filename, "r"); + if (!file) { + if (errno == ENOENT) { + VLOG_INFO("not running on a XenServer"); + } else { + VLOG_INFO("%s: open: %s", filename, strerror(errno)); + } + return NULL; + } + + while (fgets(line, sizeof line, file)) { + static const char leader[] = "INSTALLATION_UUID='"; + const int leader_len = strlen(leader); + const int uuid_len = 36; + static const char trailer[] = "'\n"; + const int trailer_len = strlen(trailer); + + if (strlen(line) == leader_len + uuid_len + trailer_len + && !memcmp(line, leader, leader_len) + && !memcmp(line + leader_len + uuid_len, trailer, trailer_len)) { + char *host_uuid = xmemdup0(line + leader_len, uuid_len); + VLOG_INFO("running on XenServer, host-uuid %s", host_uuid); + fclose(file); + return host_uuid; + } + } + fclose(file); + VLOG_ERR("%s: INSTALLATION_UUID not found", filename); + return NULL; +} + +const char * +xenserver_get_host_uuid(void) +{ + static char *host_uuid; + static bool inited; + + if (!inited) { + host_uuid = read_host_uuid(); + inited = true; + } + return host_uuid; +} + diff --git a/vswitchd/xenserver.h b/vswitchd/xenserver.h new file mode 100644 index 00000000..c69b133a --- /dev/null +++ b/vswitchd/xenserver.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#ifndef VSWITCHD_XENSERVER_H +#define VSWITCHD_XENSERVER_H 1 + +const char *xenserver_get_host_uuid(void); + +#endif /* xenserver.h */ |