aboutsummaryrefslogtreecommitdiff
path: root/vswitchd
diff options
context:
space:
mode:
author: Ben Pfaff <blp@nicira.com> 2009-07-08 13:19:16 -0700
committer: Ben Pfaff <blp@nicira.com> 2009-07-08 13:19:16 -0700
commit: 064af42167bf4fc9aaea2702d80ce08074b889c0 (patch)
tree: efd15a6dc2402eeec273bb34db3b2445687589e5 /vswitchd
Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45. [v0.90.0]
Diffstat (limited to 'vswitchd')
-rw-r--r-- vswitchd/.gitignore 7
-rw-r--r-- vswitchd/automake.mk 40
-rw-r--r-- vswitchd/bridge.c 3058
-rw-r--r-- vswitchd/bridge.h 43
-rw-r--r-- vswitchd/mgmt.c 679
-rw-r--r-- vswitchd/mgmt.h 36
-rw-r--r-- vswitchd/ovs-brcompatd.8.in 49
-rw-r--r-- vswitchd/ovs-brcompatd.c 766
-rw-r--r-- vswitchd/ovs-vswitchd.8.in 87
-rw-r--r-- vswitchd/ovs-vswitchd.c 255
-rw-r--r-- vswitchd/ovs-vswitchd.conf.5.in 642
-rw-r--r-- vswitchd/ovs-vswitchd.h 32
-rw-r--r-- vswitchd/port.c 68
-rw-r--r-- vswitchd/port.h 33
-rw-r--r-- vswitchd/proc-net-compat.c 344
-rw-r--r-- vswitchd/proc-net-compat.h 51
-rw-r--r-- vswitchd/xenserver.c 90
-rw-r--r-- vswitchd/xenserver.h 32
18 files changed, 6312 insertions, 0 deletions
diff --git a/vswitchd/.gitignore b/vswitchd/.gitignore
new file mode 100644
index 00000000..01d57ae7
--- /dev/null
+++ b/vswitchd/.gitignore
@@ -0,0 +1,7 @@
+/Makefile
+/Makefile.in
+/ovs-brcompatd
+/ovs-brcompatd.8
+/ovs-vswitchd
+/ovs-vswitchd.8
+/ovs-vswitchd.conf.5
diff --git a/vswitchd/automake.mk b/vswitchd/automake.mk
new file mode 100644
index 00000000..6883731e
--- /dev/null
+++ b/vswitchd/automake.mk
@@ -0,0 +1,40 @@
+sbin_PROGRAMS += vswitchd/ovs-vswitchd vswitchd/ovs-brcompatd
+man_MANS += \
+ vswitchd/ovs-vswitchd.conf.5 \
+ vswitchd/ovs-vswitchd.8 \
+ vswitchd/ovs-brcompatd.8
+DISTCLEANFILES += \
+ vswitchd/ovs-vswitchd.conf.5 \
+ vswitchd/ovs-vswitchd.8 \
+ vswitchd/ovs-brcompatd.8
+
+vswitchd_ovs_vswitchd_SOURCES = \
+ vswitchd/bridge.c \
+ vswitchd/bridge.h \
+ vswitchd/mgmt.c \
+ vswitchd/mgmt.h \
+ vswitchd/port.c \
+ vswitchd/port.h \
+ vswitchd/proc-net-compat.c \
+ vswitchd/proc-net-compat.h \
+ vswitchd/ovs-vswitchd.c \
+ vswitchd/ovs-vswitchd.h \
+ vswitchd/xenserver.c \
+ vswitchd/xenserver.h
+vswitchd_ovs_vswitchd_LDADD = \
+ secchan/libsecchan.a \
+ lib/libopenvswitch.a \
+ $(FAULT_LIBS) \
+ $(SSL_LIBS)
+
+vswitchd_ovs_brcompatd_SOURCES = \
+ vswitchd/ovs-brcompatd.c
+
+vswitchd_ovs_brcompatd_LDADD = \
+ lib/libopenvswitch.a \
+ $(FAULT_LIBS)
+
+EXTRA_DIST += \
+ vswitchd/ovs-vswitchd.conf.5.in \
+ vswitchd/ovs-vswitchd.8.in \
+ vswitchd/ovs-brcompatd.8.in
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
new file mode 100644
index 00000000..cfd4dcf7
--- /dev/null
+++ b/vswitchd/bridge.c
@@ -0,0 +1,3058 @@
+/* Copyright (c) 2008, 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <config.h>
+#include "bridge.h"
+#include <assert.h>
+#include <errno.h>
+#include <arpa/inet.h>
+#include <ctype.h>
+#include <inttypes.h>
+#include <net/if.h>
+#include <openflow/openflow.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "bitmap.h"
+#include "cfg.h"
+#include "coverage.h"
+#include "dirs.h"
+#include "dpif.h"
+#include "dynamic-string.h"
+#include "flow.h"
+#include "hash.h"
+#include "list.h"
+#include "mac-learning.h"
+#include "netdev.h"
+#include "odp-util.h"
+#include "ofp-print.h"
+#include "ofpbuf.h"
+#include "poll-loop.h"
+#include "port-array.h"
+#include "proc-net-compat.h"
+#include "process.h"
+#include "secchan/ofproto.h"
+#include "socket-util.h"
+#include "stp.h"
+#include "svec.h"
+#include "timeval.h"
+#include "util.h"
+#include "vconn.h"
+#include "vconn-ssl.h"
+#include "xenserver.h"
+#include "xtoxll.h"
+
+#define THIS_MODULE VLM_bridge
+#include "vlog.h"
+
+struct dst { /* NOTE(review): not used in the visible part of this file; presumably one output destination -- confirm. */
+ uint16_t vlan; /* NOTE(review): presumably the VLAN for the output -- confirm. */
+ uint16_t dp_ifidx; /* Same name as iface.dp_ifidx (index within kernel datapath). */
+};
+
+extern uint64_t mgmt_id; /* Defined outside this file; passed to ofproto_set_mgmt_id() for every bridge. */
+
+struct iface { /* One host network device belonging to a bridge port. */
+ struct port *port; /* Containing port. */
+ size_t port_ifidx; /* Index within containing port. */
+
+ char *name; /* Host network device name. */
+ int dp_ifidx; /* Index within kernel datapath; negative if not in the datapath. */
+
+ uint8_t mac[ETH_ADDR_LEN]; /* Ethernet address (all zeros if unknown). */
+
+ tag_type tag; /* Tag associated with this interface. */
+ bool enabled; /* May be chosen for flows? */
+ long long delay_expires; /* Time after which 'enabled' may change. */
+};
+
+#define BOND_MASK 0xff /* A port's 'bond_hash' array has BOND_MASK + 1 entries. */
+struct bond_entry { /* One element of a port's 'bond_hash' array. */
+ int iface_idx; /* Index of assigned iface, or -1 if none. */
+ uint64_t tx_bytes; /* Count of bytes recently transmitted. */
+ tag_type iface_tag; /* Tag associated with iface_idx. */
+};
+
+#define MAX_MIRRORS 32 /* Number of slots in each bridge's 'mirrors' array. */
+typedef uint32_t mirror_mask_t; /* NOTE(review): presumably one bit per mirror slot -- confirm. */
+#define MIRROR_MASK_C(X) UINT32_C(X)
+BUILD_ASSERT_DECL(sizeof(mirror_mask_t) * CHAR_BIT >= MAX_MIRRORS); /* The mask type must have at least MAX_MIRRORS bits. */
+struct mirror {
+ struct bridge *bridge; /* Bridge this mirror belongs to. */
+ size_t idx; /* NOTE(review): presumably the slot in bridge->mirrors[] -- confirm. */
+ char *name;
+
+ /* Selection criteria. */
+ struct svec src_ports;
+ struct svec dst_ports;
+ int *vlans; /* Array of 'n_vlans' VLANs (see vlan_is_mirrored()). */
+ size_t n_vlans;
+
+ /* Output. */
+ struct port *out_port;
+ int out_vlan;
+};
+
+#define FLOOD_PORT ((struct port *) 1) /* The 'flood' output port. */
+struct port {
+ struct bridge *bridge; /* Bridge that owns this port. */
+ size_t port_idx; /* NOTE(review): presumably the index in bridge->ports[] -- confirm. */
+ int vlan; /* -1=trunk port, else a 12-bit VLAN ID. */
+ unsigned long *trunks; /* Bitmap of trunked VLANs, if 'vlan' == -1. */
+ char *name; /* Port name, as listed under "bridge.<bridge>.port". */
+
+ /* An ordinary bridge port has 1 interface.
+ * A bridge port for bonding has at least 2 interfaces. */
+ struct iface **ifaces;
+ size_t n_ifaces, allocated_ifaces;
+
+ /* Bonding info. */
+ struct bond_entry *bond_hash; /* An array of (BOND_MASK + 1) elements. */
+ int active_iface; /* Ifidx on which bcasts accepted, or -1. */
+ tag_type active_iface_tag; /* Tag for bcast flows. */
+ tag_type no_ifaces_tag; /* Tag for flows when all ifaces disabled. */
+ int updelay, downdelay; /* Delay before iface goes up/down, in ms. */
+
+ /* Port mirroring info. */
+ mirror_mask_t src_mirrors; /* Mirrors triggered when packet received. */
+ mirror_mask_t dst_mirrors; /* Mirrors triggered when packet sent. */
+ bool is_mirror_output_port; /* Does port mirroring send frames here? */
+
+ /* Spanning tree info. */
+ enum stp_state stp_state; /* Always STP_FORWARDING if STP not in use. */
+ tag_type stp_state_tag; /* Tag for STP state change. */
+};
+
+#define DP_MAX_PORTS 255 /* NOTE(review): not referenced in the visible code; presumably the datapath port limit -- confirm. */
+struct bridge {
+ struct list node; /* Node in global list of bridges. */
+ char *name; /* User-specified arbitrary name. */
+ struct mac_learning *ml; /* MAC learning table, or null not to learn. */
+ bool sent_config_request; /* Successfully sent config request? */
+ uint8_t default_ea[ETH_ADDR_LEN]; /* Default MAC; randomly generated in bridge_create(). */
+
+ /* Support for remote controllers. */
+ char *controller; /* NULL if there is no remote controller;
+ * "discover" to do controller discovery;
+ * otherwise a vconn name. */
+
+ /* OpenFlow switch processing. */
+ struct ofproto *ofproto; /* OpenFlow switch. */
+
+ /* Kernel datapath information. */
+ struct dpif dpif; /* Kernel datapath. */
+ struct port_array ifaces; /* Indexed by kernel datapath port number. */
+
+ /* Bridge ports. */
+ struct port **ports;
+ size_t n_ports, allocated_ports;
+
+ /* Bonding. */
+ bool has_bonded_ports;
+ long long int bond_next_rebalance; /* Initialised to time_msec() + 10000 in bridge_create(). */
+
+ /* Flow tracking. */
+ bool flush; /* Set by bridge_flush(); passed to ofproto_run2() and then cleared. */
+
+ /* Flow statistics gathering. */
+ time_t next_stats_request;
+
+ /* Port mirroring. */
+ struct mirror *mirrors[MAX_MIRRORS];
+
+ /* Spanning tree. */
+ struct stp *stp;
+ long long int stp_last_tick;
+};
+
+/* List of all bridges. bridge_create() appends to it and bridge_destroy()
+ * removes from it. */
+static struct list all_bridges = LIST_INITIALIZER(&all_bridges);
+
+/* Maximum number of datapaths. bridge_init() probes datapaths "dp0" through
+ * "dp<DP_MAX - 1>". */
+enum { DP_MAX = 256 };
+
+static struct bridge *bridge_create(const char *name); /* Returns NULL on failure. */
+static void bridge_destroy(struct bridge *); /* Accepts a null pointer. */
+static struct bridge *bridge_lookup(const char *name); /* Returns NULL if no bridge has that name. */
+static int bridge_run_one(struct bridge *);
+static void bridge_reconfigure_one(struct bridge *);
+static void bridge_reconfigure_controller(struct bridge *);
+static void bridge_get_all_ifaces(const struct bridge *, struct svec *ifaces);
+static void bridge_fetch_dp_ifaces(struct bridge *);
+static void bridge_flush(struct bridge *);
+static void bridge_pick_local_hw_addr(struct bridge *,
+ uint8_t ea[ETH_ADDR_LEN],
+ const char **devname);
+static uint64_t bridge_pick_datapath_id(struct bridge *,
+ const uint8_t bridge_ea[ETH_ADDR_LEN],
+ const char *devname);
+static uint64_t dpid_from_hash(const void *, size_t nbytes);
+
+static void bond_run(struct bridge *);
+static void bond_wait(struct bridge *);
+static void bond_rebalance_port(struct port *);
+
+static void port_create(struct bridge *, const char *name);
+static void port_reconfigure(struct port *);
+static void port_destroy(struct port *);
+static struct port *port_lookup(const struct bridge *, const char *name);
+static struct port *port_from_dp_ifidx(const struct bridge *,
+ uint16_t dp_ifidx);
+static void port_update_bond_compat(struct port *);
+static void port_update_vlan_compat(struct port *);
+
+static void mirror_create(struct bridge *, const char *name);
+static void mirror_destroy(struct mirror *);
+static void mirror_reconfigure(struct bridge *);
+static void mirror_reconfigure_one(struct mirror *);
+static bool vlan_is_mirrored(const struct mirror *, int vlan);
+
+static void brstp_reconfigure(struct bridge *);
+static void brstp_adjust_timers(struct bridge *);
+static void brstp_run(struct bridge *);
+static void brstp_wait(struct bridge *);
+
+static void iface_create(struct port *, const char *name);
+static void iface_destroy(struct iface *);
+static struct iface *iface_lookup(const struct bridge *, const char *name);
+static struct iface *iface_from_dp_ifidx(const struct bridge *,
+ uint16_t dp_ifidx);
+
+/* Hooks into ofproto processing. Passed to ofproto_create() for every bridge
+ * in bridge_create(). */
+static struct ofhooks bridge_ofhooks;
+
+/* Public functions. */
+
+/* Appends to 'svec' the name of every interface, on every bridge, that has a
+ * datapath port number other than ODPP_LOCAL.
+ *
+ * An interface with no datapath port number (negative 'dp_ifidx') is reported
+ * with an error message and left out. */
+void
+bridge_get_ifaces(struct svec *svec)
+{
+    struct bridge *br, *next_br;
+
+    LIST_FOR_EACH_SAFE (br, next_br, struct bridge, node, &all_bridges) {
+        size_t port_no;
+
+        for (port_no = 0; port_no < br->n_ports; port_no++) {
+            struct port *port = br->ports[port_no];
+            size_t k;
+
+            for (k = 0; k < port->n_ifaces; k++) {
+                struct iface *iface = port->ifaces[k];
+
+                if (iface->dp_ifidx < 0) {
+                    VLOG_ERR("%s interface not in dp%u, ignoring",
+                             iface->name, dpif_id(&br->dpif));
+                    continue;
+                }
+                if (iface->dp_ifidx != ODPP_LOCAL) {
+                    svec_add(svec, iface->name);
+                }
+            }
+        }
+    }
+}
+
+/* Cleans up existing datapaths and then applies the configuration.
+ *
+ * Each of the datapaths "dp0" through "dp<DP_MAX - 1>" that can be opened is
+ * deleted if its name cannot be obtained or if the configuration has no
+ * "bridge.<name>.port" key for it.  Afterward bridge_reconfigure() is called.
+ *
+ * The caller must already have called cfg_read(). */
+void
+bridge_init(void)
+{
+    int dp_no;
+
+    for (dp_no = 0; dp_no < DP_MAX; dp_no++) {
+        char dpif_name[IF_NAMESIZE];
+        char devname[16];
+        struct dpif dpif;
+        int error;
+
+        sprintf(devname, "dp%d", dp_no);
+        error = dpif_open(devname, &dpif);
+        if (error) {
+            /* ENODEV is not reported; any other open failure is logged. */
+            if (error != ENODEV) {
+                VLOG_ERR("failed to delete datapath dp%d: %s",
+                         dp_no, strerror(error));
+            }
+            continue;
+        }
+
+        if (dpif_get_name(&dpif, dpif_name, sizeof dpif_name)
+            || !cfg_has("bridge.%s.port", dpif_name)) {
+            dpif_delete(&dpif);
+        }
+        dpif_close(&dpif);
+    }
+
+    bridge_reconfigure();
+}
+
+#ifdef HAVE_OPENSSL
+/* Checks whether the configured string for 'key' differs from '*valuep'.
+ *
+ * If 'key' has a value and '*valuep' is null or holds a different string,
+ * frees '*valuep', replaces it with a copy of the configured value, and
+ * returns true.  Otherwise leaves '*valuep' alone and returns false (in
+ * particular, a key that is no longer configured never reports a change). */
+static bool
+config_string_change(const char *key, char **valuep)
+{
+    const char *configured = cfg_get_string(0, "%s", key);
+
+    if (!configured) {
+        return false;
+    }
+    if (*valuep && !strcmp(configured, *valuep)) {
+        return false;
+    }
+
+    free(*valuep);
+    *valuep = xstrdup(configured);
+    return true;
+}
+
+/* Hands the configured SSL private key, certificate, and CA certificate file
+ * names to the vconn SSL layer, but only for settings whose value changed
+ * since the last call.  The last-seen values are kept in function-local
+ * statics.
+ *
+ * XXX SSL should be configurable on a per-bridge basis.
+ * XXX should be possible to de-configure SSL. */
+static void
+bridge_configure_ssl(void)
+{
+    static char *last_private_key;
+    static char *last_certificate;
+    static char *last_ca_cert;
+
+    if (config_string_change("ssl.private-key", &last_private_key)) {
+        vconn_ssl_set_private_key_file(last_private_key);
+    }
+
+    if (config_string_change("ssl.certificate", &last_certificate)) {
+        vconn_ssl_set_certificate_file(last_certificate);
+    }
+
+    if (config_string_change("ssl.ca-cert", &last_ca_cert)) {
+        /* The bootstrap flag is only read when the CA cert file changes. */
+        vconn_ssl_set_ca_cert_file(last_ca_cert,
+                                   cfg_get_bool(0, "ssl.bootstrap-ca-cert"));
+    }
+}
+#endif
+
+/* Brings the set of bridges, and each bridge's datapath ports and settings,
+ * into line with the current configuration.
+ *
+ * The steps, in order:
+ *
+ *   1. Destroy bridges that are no longer configured and create newly
+ *      configured ones.  Names that look like "dp<digit>..." or
+ *      "nl:<digit>..." are rejected.
+ *   2. (Only with OpenSSL) pick up SSL file changes.
+ *   3. Reconfigure the ports of every bridge (bridge_reconfigure_one()).
+ *   4. Delete unwanted ports from every datapath, then add missing ports to
+ *      every datapath.
+ *   5. For each bridge: drop interfaces that did not end up in the datapath,
+ *      set the local port's Ethernet address and the datapath ID, configure
+ *      NetFlow, and configure the controller.
+ *   6. Update VLAN compatibility info for every port and reconfigure
+ *      spanning tree for every bridge. */
+void
+bridge_reconfigure(void)
+{
+    struct svec old_br, new_br, raw_new_br;
+    struct bridge *br, *next;
+    size_t i, j;
+
+    COVERAGE_INC(bridge_reconfigure);
+
+    /* Collect old bridges. */
+    svec_init(&old_br);
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        svec_add(&old_br, br->name);
+    }
+
+    /* Collect new bridges. */
+    svec_init(&raw_new_br);
+    cfg_get_subsections(&raw_new_br, "bridge");
+    svec_init(&new_br);
+    for (i = 0; i < raw_new_br.n; i++) {
+        const char *name = raw_new_br.names[i];
+        /* isdigit() is only defined for values representable as unsigned
+         * char (or EOF), so convert before calling it. */
+        if ((!strncmp(name, "dp", 2) && isdigit((unsigned char) name[2])) ||
+            (!strncmp(name, "nl:", 3) && isdigit((unsigned char) name[3]))) {
+            VLOG_ERR("%s is not a valid bridge name (bridges may not be "
+                     "named \"dp\" or \"nl:\" followed by a digit)", name);
+        } else {
+            svec_add(&new_br, name);
+        }
+    }
+    svec_destroy(&raw_new_br);
+
+    /* Get rid of deleted bridges and add new bridges. */
+    svec_sort(&old_br);
+    svec_sort(&new_br);
+    assert(svec_is_unique(&old_br));
+    assert(svec_is_unique(&new_br));
+    LIST_FOR_EACH_SAFE (br, next, struct bridge, node, &all_bridges) {
+        if (!svec_contains(&new_br, br->name)) {
+            bridge_destroy(br);
+        }
+    }
+    for (i = 0; i < new_br.n; i++) {
+        const char *name = new_br.names[i];
+        if (!svec_contains(&old_br, name)) {
+            /* On failure bridge_create() logs and returns NULL; the bridge
+             * is then simply absent from 'all_bridges'. */
+            bridge_create(name);
+        }
+    }
+    svec_destroy(&old_br);
+    svec_destroy(&new_br);
+
+#ifdef HAVE_OPENSSL
+    /* Configure SSL. */
+    bridge_configure_ssl();
+#endif
+
+    /* Reconfigure all bridges. */
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        bridge_reconfigure_one(br);
+    }
+
+    /* Add and delete ports on all datapaths.
+     *
+     * The kernel will reject any attempt to add a given port to a datapath if
+     * that port already belongs to a different datapath, so we must do all
+     * port deletions before any port additions. */
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        struct odp_port *dpif_ports;
+        size_t n_dpif_ports;
+        struct svec want_ifaces;
+
+        dpif_port_list(&br->dpif, &dpif_ports, &n_dpif_ports);
+        bridge_get_all_ifaces(br, &want_ifaces);
+        for (i = 0; i < n_dpif_ports; i++) {
+            const struct odp_port *p = &dpif_ports[i];
+            /* The port named after the bridge itself is never removed. */
+            if (!svec_contains(&want_ifaces, p->devname)
+                && strcmp(p->devname, br->name)) {
+                int retval = dpif_port_del(&br->dpif, p->port);
+                if (retval) {
+                    VLOG_ERR("failed to remove %s interface from dp%u: %s",
+                             p->devname, dpif_id(&br->dpif), strerror(retval));
+                }
+            }
+        }
+        svec_destroy(&want_ifaces);
+        free(dpif_ports);
+    }
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        struct odp_port *dpif_ports;
+        size_t n_dpif_ports;
+        struct svec cur_ifaces, want_ifaces, add_ifaces;
+        int next_port_no;
+
+        dpif_port_list(&br->dpif, &dpif_ports, &n_dpif_ports);
+        svec_init(&cur_ifaces);
+        for (i = 0; i < n_dpif_ports; i++) {
+            svec_add(&cur_ifaces, dpif_ports[i].devname);
+        }
+        free(dpif_ports);
+        svec_sort_unique(&cur_ifaces);
+        bridge_get_all_ifaces(br, &want_ifaces);
+        svec_diff(&want_ifaces, &cur_ifaces, &add_ifaces, NULL, NULL);
+
+        next_port_no = 1;
+        for (i = 0; i < add_ifaces.n; i++) {
+            const char *if_name = add_ifaces.names[i];
+            /* Try successive port numbers until one is not already taken
+             * (EEXIST means "taken, try the next one"). */
+            for (;;) {
+                int internal = cfg_get_bool(0, "iface.%s.internal", if_name);
+                int error = dpif_port_add(&br->dpif, if_name, next_port_no++,
+                                          internal ? ODP_PORT_INTERNAL : 0);
+                if (error != EEXIST) {
+                    if (next_port_no >= 256) {
+                        VLOG_ERR("ran out of valid port numbers on dp%u",
+                                 dpif_id(&br->dpif));
+                        goto out;
+                    }
+                    if (error) {
+                        VLOG_ERR("failed to add %s interface to dp%u: %s",
+                                 if_name, dpif_id(&br->dpif), strerror(error));
+                    }
+                    break;
+                }
+            }
+        }
+    out:
+        svec_destroy(&cur_ifaces);
+        svec_destroy(&want_ifaces);
+        svec_destroy(&add_ifaces);
+    }
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        /* NOTE(review): 8 bytes rather than ETH_ADDR_LEN.  Do not shrink this
+         * unless bridge_pick_local_hw_addr() is known to write at most
+         * ETH_ADDR_LEN bytes into it. */
+        uint8_t ea[8];
+        uint64_t dpid;
+        struct iface *local_iface = NULL;
+        const char *devname;
+        /* NetFlow engine type and ID default to the datapath's minor
+         * number. */
+        uint8_t engine_type = br->dpif.minor;
+        uint8_t engine_id = br->dpif.minor;
+        bool add_id_to_iface = false;
+        struct svec nf_hosts;
+
+
+        bridge_fetch_dp_ifaces(br);
+        /* 'i' and 'j' only advance when nothing was destroyed, because
+         * destroying an element changes the array being walked. */
+        for (i = 0; i < br->n_ports; ) {
+            struct port *port = br->ports[i];
+
+            for (j = 0; j < port->n_ifaces; ) {
+                struct iface *iface = port->ifaces[j];
+                if (iface->dp_ifidx < 0) {
+                    VLOG_ERR("%s interface not in dp%u, dropping",
+                             iface->name, dpif_id(&br->dpif));
+                    iface_destroy(iface);
+                } else {
+                    if (iface->dp_ifidx == ODPP_LOCAL) {
+                        local_iface = iface;
+                    }
+                    VLOG_DBG("dp%u has interface %s on port %d",
+                             dpif_id(&br->dpif), iface->name, iface->dp_ifidx);
+                    j++;
+                }
+            }
+            if (!port->n_ifaces) {
+                VLOG_ERR("%s port has no interfaces, dropping", port->name);
+                port_destroy(port);
+                continue;
+            }
+            i++;
+        }
+
+        /* Pick local port hardware address, datapath ID. */
+        bridge_pick_local_hw_addr(br, ea, &devname);
+        if (local_iface) {
+            int error = netdev_nodev_set_etheraddr(local_iface->name, ea);
+            if (error) {
+                static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+                VLOG_ERR_RL(&rl, "bridge %s: failed to set bridge "
+                            "Ethernet address: %s",
+                            br->name, strerror(error));
+            }
+        }
+
+        dpid = bridge_pick_datapath_id(br, ea, devname);
+        ofproto_set_datapath_id(br->ofproto, dpid);
+
+        /* Set NetFlow configuration on this bridge. */
+        if (cfg_has("netflow.%s.engine-type", br->name)) {
+            engine_type = cfg_get_int(0, "netflow.%s.engine-type",
+                                      br->name);
+        }
+        if (cfg_has("netflow.%s.engine-id", br->name)) {
+            engine_id = cfg_get_int(0, "netflow.%s.engine-id", br->name);
+        }
+        if (cfg_has("netflow.%s.add-id-to-iface", br->name)) {
+            add_id_to_iface = cfg_get_bool(0, "netflow.%s.add-id-to-iface",
+                                           br->name);
+        }
+        if (add_id_to_iface && engine_id > 0x7f) {
+            VLOG_WARN("bridge %s: netflow port mangling may conflict with "
+                      "another vswitch, choose an engine id less than 128",
+                      br->name);
+        }
+        if (add_id_to_iface && br->n_ports > 0x1ff) {
+            VLOG_WARN("bridge %s: netflow port mangling will conflict with "
+                      "another port when 512 or more ports are used",
+                      br->name);
+        }
+        svec_init(&nf_hosts);
+        cfg_get_all_keys(&nf_hosts, "netflow.%s.host", br->name);
+        if (ofproto_set_netflow(br->ofproto, &nf_hosts, engine_type,
+                                engine_id, add_id_to_iface)) {
+            VLOG_ERR("bridge %s: problem setting netflow collectors",
+                     br->name);
+        }
+        /* Release the collector list; it was previously leaked on every
+         * reconfiguration.
+         * NOTE(review): assumes ofproto_set_netflow() does not keep a
+         * reference to 'nf_hosts' -- confirm against its declaration. */
+        svec_destroy(&nf_hosts);
+
+        /* Update the controller and related settings.  It would be more
+         * straightforward to call this from bridge_reconfigure_one(), but we
+         * can't do it there for two reasons.  First, and most importantly, at
+         * that point we don't know the dp_ifidx of any interfaces that have
+         * been added to the bridge (because we haven't actually added them to
+         * the datapath).  Second, at that point we haven't set the datapath ID
+         * yet; when a controller is configured, resetting the datapath ID will
+         * immediately disconnect from the controller, so it's better to set
+         * the datapath ID before the controller. */
+        bridge_reconfigure_controller(br);
+    }
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        for (i = 0; i < br->n_ports; i++) {
+            struct port *port = br->ports[i];
+            port_update_vlan_compat(port);
+        }
+    }
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        brstp_reconfigure(br);
+    }
+}
+
+/* Chooses the Ethernet address for the local port of bridge 'br' and stores
+ * it in the ETH_ADDR_LEN bytes at 'ea'.
+ *
+ * The address is chosen as follows:
+ *
+ *   - If "bridge.<name>.mac" is configured and is neither a multicast
+ *     address nor all-zeros, that address is used and '*devname' is set to
+ *     NULL.
+ *
+ *   - Otherwise, the minimum Ethernet address among the bridge's interfaces
+ *     is used and '*devname' is set to the name of the interface that it
+ *     came from.  Mirror output ports, the local port, interfaces configured
+ *     as internal, and multicast, reserved, or zero addresses are not
+ *     considered.
+ *
+ *   - If that yields no address (or yields a multicast or VIF address),
+ *     'br->default_ea' is used and '*devname' is set to NULL. */
+static void
+bridge_pick_local_hw_addr(struct bridge *br, uint8_t ea[ETH_ADDR_LEN],
+                          const char **devname)
+{
+    uint64_t requested_ea;
+    size_t i, j;
+
+    *devname = NULL;
+
+    /* Did the user request a particular MAC? */
+    requested_ea = cfg_get_mac(0, "bridge.%s.mac", br->name);
+    if (requested_ea) {
+        eth_addr_from_uint64(requested_ea, ea);
+        if (eth_addr_is_multicast(ea)) {
+            VLOG_ERR("bridge %s: cannot set MAC address to multicast "
+                     "address "ETH_ADDR_FMT, br->name, ETH_ADDR_ARGS(ea));
+        } else if (eth_addr_is_zero(ea)) {
+            VLOG_ERR("bridge %s: cannot set MAC address to zero", br->name);
+        } else {
+            return;
+        }
+    }
+
+    /* Otherwise choose the minimum MAC address among all of the interfaces.
+     * (Xen uses FE:FF:FF:FF:FF:FF for virtual interfaces so this will get the
+     * MAC of the physical interface in such an environment.)
+     *
+     * Start from the all-ones address so that any usable address compares
+     * lower.  'ea' is a pointer here (array parameters decay), so the length
+     * must be ETH_ADDR_LEN, not 'sizeof ea'. */
+    memset(ea, 0xff, ETH_ADDR_LEN);
+    for (i = 0; i < br->n_ports; i++) {
+        struct port *port = br->ports[i];
+        if (port->is_mirror_output_port) {
+            continue;
+        }
+        for (j = 0; j < port->n_ifaces; j++) {
+            struct iface *iface = port->ifaces[j];
+            uint8_t iface_ea[ETH_ADDR_LEN];
+            int error;
+
+            if (iface->dp_ifidx == ODPP_LOCAL
+                || cfg_get_bool(0, "iface.%s.internal", iface->name)) {
+                continue;
+            }
+            error = netdev_nodev_get_etheraddr(iface->name, iface_ea);
+            if (!error) {
+                if (!eth_addr_is_multicast(iface_ea) &&
+                    !eth_addr_is_reserved(iface_ea) &&
+                    !eth_addr_is_zero(iface_ea) &&
+                    memcmp(iface_ea, ea, ETH_ADDR_LEN) < 0) {
+                    memcpy(ea, iface_ea, ETH_ADDR_LEN);
+                    *devname = iface->name;
+                }
+            } else {
+                static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+                VLOG_ERR_RL(&rl, "failed to obtain Ethernet address of %s: %s",
+                            iface->name, strerror(error));
+            }
+        }
+    }
+    /* Still all-ones (which is a multicast address) means that no interface
+     * supplied a usable address. */
+    if (eth_addr_is_multicast(ea) || eth_addr_is_vif(ea)) {
+        memcpy(ea, br->default_ea, ETH_ADDR_LEN);
+        *devname = NULL;
+        VLOG_WARN("bridge %s: using default bridge Ethernet "
+                  "address "ETH_ADDR_FMT, br->name, ETH_ADDR_ARGS(ea));
+    } else {
+        VLOG_DBG("bridge %s: using bridge Ethernet address "ETH_ADDR_FMT,
+                 br->name, ETH_ADDR_ARGS(ea));
+    }
+}
+
+/* Returns the datapath ID to use for bridge 'br', whose Ethernet address is
+ * 'bridge_ea'.
+ *
+ * If 'bridge_ea' was taken from a network device, the caller must pass that
+ * device's name as 'devname'; if it was obtained any other way, 'devname'
+ * must be a null pointer.
+ *
+ * Usually the bridge MAC is unique and doubles as the datapath ID.  In some
+ * special cases several bridges end up with the same MAC, which is fine for
+ * the bridges but confuses the OpenFlow controller, because every datapath
+ * needs a unique ID.  It is also very desirable for the ID to be stable from
+ * one run to the next, so that policy set on a datapath "sticks".
+ *
+ * The ID is therefore picked from the first of these that applies:
+ *
+ *   1. A nonzero "bridge.<name>.datapath-id" from the configuration.
+ *
+ *   2. 'devname' is a VLAN device (one created with vconfig(8) or similar).
+ *      Such a bridge has the same MAC as a bridge on the underlying physical
+ *      device, so the MAC is hashed together with the VLAN ID.
+ *
+ *   3. 'devname' is null (a purely internal bridge with no natural unique
+ *      identifier) and the host is a XenServer: the host UUID is hashed with
+ *      the bridge name.  Bridge names persist across XenServer reboots,
+ *      although a name can be reused if an internal network is destroyed and
+ *      a new one is created later, so this is fairly effective.
+ *
+ *   4. Otherwise 'bridge_ea' itself.  For an internal bridge on a host that
+ *      is not a XenServer this means punting: the address is a random MAC
+ *      that changes on each run. */
+static uint64_t
+bridge_pick_datapath_id(struct bridge *br,
+                        const uint8_t bridge_ea[ETH_ADDR_LEN],
+                        const char *devname)
+{
+    uint64_t dpid = cfg_get_dpid(0, "bridge.%s.datapath-id", br->name);
+
+    if (dpid) {
+        return dpid;
+    }
+
+    if (!devname) {
+        const char *host_uuid = xenserver_get_host_uuid();
+        if (host_uuid) {
+            char *seed = xasprintf("%s,%s", host_uuid, br->name);
+            dpid = dpid_from_hash(seed, strlen(seed));
+            free(seed);
+            return dpid;
+        }
+    } else {
+        int vid;
+        if (!netdev_get_vlan_vid(devname, &vid)) {
+            /* MAC followed by the VLAN ID, most significant byte first. */
+            uint8_t seed[ETH_ADDR_LEN + 2];
+            memcpy(seed, bridge_ea, ETH_ADDR_LEN);
+            seed[ETH_ADDR_LEN] = vid >> 8;
+            seed[ETH_ADDR_LEN + 1] = vid;
+            return dpid_from_hash(seed, sizeof seed);
+        }
+        /* Not a VLAN device: assume that the MAC is unique. */
+    }
+
+    return eth_addr_to_uint64(bridge_ea);
+}
+
+/* Derives a datapath ID from the 'n' bytes at 'data': computes their SHA-1
+ * digest, passes the start of the digest through eth_addr_mark_random(), and
+ * returns it converted with eth_addr_to_uint64(). */
+static uint64_t
+dpid_from_hash(const void *data, size_t n)
+{
+    uint8_t digest[SHA1HashSize];
+
+    /* The digest is reused as an Ethernet address, so it must be at least
+     * that long. */
+    BUILD_ASSERT_DECL(sizeof digest >= ETH_ADDR_LEN);
+
+    SHA1Bytes(data, n, digest);
+    eth_addr_mark_random(digest);
+    return eth_addr_to_uint64(digest);
+}
+
+/* Runs every bridge once via bridge_run_one().
+ *
+ * Returns 0 if every bridge ran without error, otherwise the error from the
+ * first bridge that failed.  Each failure is logged (rate-limited); the
+ * remaining bridges are still run. */
+int
+bridge_run(void)
+{
+    struct bridge *br, *next_br;
+    int first_error = 0;
+
+    LIST_FOR_EACH_SAFE (br, next_br, struct bridge, node, &all_bridges) {
+        int error = bridge_run_one(br);
+        if (error) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+            VLOG_ERR_RL(&rl, "bridge %s: datapath was destroyed externally, "
+                        "forcing reconfiguration", br->name);
+            first_error = first_error ? first_error : error;
+        }
+    }
+    return first_error;
+}
+
+/* Registers the wake-up conditions of every bridge.
+ *
+ * ofproto_wait() is called for every bridge.  MAC learning, bonding, and
+ * spanning tree waits are only registered for bridges that have no
+ * controller configured. */
+void
+bridge_wait(void)
+{
+    struct bridge *br;
+
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        ofproto_wait(br->ofproto);
+
+        if (!br->controller) {
+            if (br->ml) {
+                mac_learning_wait(br->ml);
+            }
+            bond_wait(br);
+            brstp_wait(br);
+        }
+    }
+}
+
+/* Makes 'br' revalidate all of its flows, which is the right thing to do
+ * after a change to 'br''s configuration.
+ *
+ * Sets the bridge's 'flush' flag (consumed by the next bridge_run_one()) and
+ * flushes the MAC learning table, if there is one. */
+static void
+bridge_flush(struct bridge *br)
+{
+    COVERAGE_INC(bridge_flush);
+
+    br->flush = true;
+    if (br->ml != NULL) {
+        mac_learning_flush(br->ml);
+    }
+}
+
+/* Bridge reconfiguration functions. */
+
+/* Creates a bridge named 'name', together with its datapath and OpenFlow
+ * switch, and appends it to 'all_bridges'.  No bridge named 'name' may exist
+ * yet.
+ *
+ * If a datapath named 'name' already exists, it is opened and its flows are
+ * flushed instead of creating a new one.
+ *
+ * Returns the new bridge, or a null pointer (after logging the reason) if
+ * the datapath or the switch could not be set up. */
+static struct bridge *
+bridge_create(const char *name)
+{
+    struct bridge *br;
+    int error;
+
+    assert(!bridge_lookup(name));
+    br = xcalloc(1, sizeof *br);
+
+    error = dpif_create(name, &br->dpif);
+    if (error == EEXIST) {
+        error = dpif_open(name, &br->dpif);
+        if (error) {
+            VLOG_ERR("datapath %s already exists but cannot be opened: %s",
+                     name, strerror(error));
+            goto error_free;
+        }
+        dpif_flow_flush(&br->dpif);
+    } else if (error) {
+        VLOG_ERR("failed to create datapath %s: %s", name, strerror(error));
+        goto error_free;
+    }
+
+    error = ofproto_create(name, &bridge_ofhooks, br, &br->ofproto);
+    if (error) {
+        VLOG_ERR("failed to create switch %s: %s", name, strerror(error));
+        dpif_delete(&br->dpif);
+        dpif_close(&br->dpif);
+        goto error_free;
+    }
+
+    br->name = xstrdup(name);
+    br->ml = mac_learning_create();
+    br->sent_config_request = false;
+    eth_addr_random(br->default_ea);
+
+    port_array_init(&br->ifaces);
+
+    br->flush = false;
+    br->bond_next_rebalance = time_msec() + 10000;
+
+    list_push_back(&all_bridges, &br->node);
+
+    VLOG_INFO("created bridge %s on dp%u", br->name, dpif_id(&br->dpif));
+
+    return br;
+
+error_free:
+    free(br);
+    return NULL;
+}
+
+/* Destroys 'br': destroys all of its ports, unlinks it from 'all_bridges',
+ * deletes and closes its datapath, destroys its OpenFlow switch, and frees
+ * its memory.  Does nothing if 'br' is null.
+ *
+ * A failure to delete the datapath is logged unless the error is ENOENT. */
+static void
+bridge_destroy(struct bridge *br)
+{
+    int error;
+
+    if (!br) {
+        return;
+    }
+
+    /* port_destroy() shrinks 'n_ports', so keep removing the last port. */
+    while (br->n_ports > 0) {
+        port_destroy(br->ports[br->n_ports - 1]);
+    }
+    list_remove(&br->node);
+
+    error = dpif_delete(&br->dpif);
+    if (error && error != ENOENT) {
+        VLOG_ERR("failed to delete dp%u: %s",
+                 dpif_id(&br->dpif), strerror(error));
+    }
+    dpif_close(&br->dpif);
+
+    ofproto_destroy(br->ofproto);
+    free(br->controller);
+    mac_learning_destroy(br->ml);
+    port_array_destroy(&br->ifaces);
+    free(br->ports);
+    free(br->name);
+    free(br);
+}
+
+/* Searches 'all_bridges' for a bridge whose name is exactly 'name'.  Returns
+ * the bridge, or a null pointer if there is none. */
+static struct bridge *
+bridge_lookup(const char *name)
+{
+    struct bridge *candidate;
+
+    LIST_FOR_EACH (candidate, struct bridge, node, &all_bridges) {
+        if (strcmp(candidate->name, name) == 0) {
+            return candidate;
+        }
+    }
+    return NULL;
+}
+
+/* Returns true if a bridge named 'name' currently exists, false otherwise. */
+bool
+bridge_exists(const char *name)
+{
+    return bridge_lookup(name) != NULL;
+}
+
+/* Returns the datapath ID of the bridge named 'name', as reported by its
+ * OpenFlow switch, or 0 if there is no such bridge. */
+uint64_t
+bridge_get_datapathid(const char *name)
+{
+    struct bridge *br = bridge_lookup(name);
+
+    if (!br) {
+        return 0;
+    }
+    return ofproto_get_datapath_id(br->ofproto);
+}
+
+/* Performs one round of periodic processing for 'br'.
+ *
+ * Runs the first ofproto stage; if that fails, returns its error without
+ * doing anything else.  Otherwise runs MAC learning (if enabled), bonding,
+ * and spanning tree, then the second ofproto stage, passing it the bridge's
+ * pending 'flush' flag, which is cleared afterward.  Returns the result of
+ * the second stage. */
+static int
+bridge_run_one(struct bridge *br)
+{
+    int error = ofproto_run1(br->ofproto);
+
+    if (!error) {
+        if (br->ml) {
+            mac_learning_run(br->ml, ofproto_get_revalidate_set(br->ofproto));
+        }
+        bond_run(br);
+        brstp_run(br);
+
+        error = ofproto_run2(br->ofproto, br->flush);
+        br->flush = false;
+    }
+
+    return error;
+}
+
+/* Returns the controller configured for 'br': the value of
+ * "bridge.<name>.controller" if that key is set, otherwise the value of
+ * "mgmt.controller".  Returns a null pointer if neither key is set or if the
+ * chosen value is the empty string. */
+static const char *
+bridge_get_controller(const struct bridge *br)
+{
+    const char *target = cfg_get_string(0, "bridge.%s.controller", br->name);
+
+    if (!target) {
+        target = cfg_get_string(0, "mgmt.controller");
+    }
+    if (!target || !target[0]) {
+        return NULL;
+    }
+    return target;
+}
+
+/* Updates the in-memory state of bridge 'br' to match the configuration:
+ * the set of ports and their settings, the OpenFlow management listeners,
+ * the controller connection snoops, and the mirrors.
+ *
+ * Interfaces that appear on more than one port are removed from the later
+ * port, and ports left without interfaces are dropped.  Nothing is added to
+ * or removed from the datapath here. */
+static void
+bridge_reconfigure_one(struct bridge *br)
+{
+    struct svec prev_ports, cfg_ports, seen_ifaces;
+    struct svec want_listeners, cur_listeners;
+    struct svec want_snoops, cur_snoops;
+    size_t p, k;
+
+    /* Names of the ports that the bridge has right now. */
+    svec_init(&prev_ports);
+    for (p = 0; p < br->n_ports; p++) {
+        svec_add(&prev_ports, br->ports[p]->name);
+    }
+    svec_sort(&prev_ports);
+    assert(svec_is_unique(&prev_ports));
+
+    /* Names of the ports that the configuration asks for.  When a controller
+     * is configured, a port named after the bridge itself is added if the
+     * configuration does not list one. */
+    svec_init(&cfg_ports);
+    cfg_get_all_keys(&cfg_ports, "bridge.%s.port", br->name);
+    svec_sort(&cfg_ports);
+    if (bridge_get_controller(br) && !svec_contains(&cfg_ports, br->name)) {
+        svec_add(&cfg_ports, br->name);
+        svec_sort(&cfg_ports);
+    }
+    if (!svec_is_unique(&cfg_ports)) {
+        VLOG_WARN("bridge %s: %s specified twice as bridge port",
+                  br->name, svec_get_duplicate(&cfg_ports));
+        svec_unique(&cfg_ports);
+    }
+
+    ofproto_set_mgmt_id(br->ofproto, mgmt_id);
+
+    /* Drop ports that are no longer configured.  The index only advances
+     * when the current port is kept. */
+    p = 0;
+    while (p < br->n_ports) {
+        struct port *port = br->ports[p];
+        if (svec_contains(&cfg_ports, port->name)) {
+            p++;
+        } else {
+            port_destroy(port);
+        }
+    }
+
+    /* Create ports that are configured but did not exist before. */
+    for (p = 0; p < cfg_ports.n; p++) {
+        const char *port_name = cfg_ports.names[p];
+        if (!svec_contains(&prev_ports, port_name)) {
+            port_create(br, port_name);
+        }
+    }
+    svec_destroy(&prev_ports);
+    svec_destroy(&cfg_ports);
+
+    /* Apply the per-port configuration. */
+    for (p = 0; p < br->n_ports; p++) {
+        port_reconfigure(br->ports[p]);
+    }
+
+    /* Remove interfaces that were already seen on an earlier port, and then
+     * any port that ends up with no interfaces at all. */
+    svec_init(&seen_ifaces);
+    p = 0;
+    while (p < br->n_ports) {
+        struct port *port = br->ports[p];
+
+        k = 0;
+        while (k < port->n_ifaces) {
+            struct iface *iface = port->ifaces[k];
+            if (!svec_contains(&seen_ifaces, iface->name)) {
+                svec_add(&seen_ifaces, iface->name);
+                svec_sort(&seen_ifaces);
+                k++;
+            } else {
+                VLOG_ERR("bridge %s: %s interface is on multiple ports, "
+                         "removing from %s",
+                         br->name, iface->name, port->name);
+                iface_destroy(iface);
+            }
+        }
+
+        if (port->n_ifaces) {
+            p++;
+        } else {
+            VLOG_ERR("%s port has no interfaces, dropping", port->name);
+            port_destroy(port);
+        }
+    }
+    svec_destroy(&seen_ifaces);
+
+    /* Flows are not deleted here when switching between connected and
+     * standalone operation.  (XXX Should we delete all flows if we are
+     * switching from one controller to another?) */
+
+    /* OpenFlow management listeners.  With nothing configured, a default
+     * unix socket under 'ovs_rundir' is used; a single "none" entry means no
+     * listeners at all.  The switch is only touched if the set changed. */
+    svec_init(&want_listeners);
+    cfg_get_all_strings(&want_listeners, "bridge.%s.openflow.listeners",
+                        br->name);
+    if (!want_listeners.n) {
+        svec_add_nocopy(&want_listeners, xasprintf("punix:%s/%s.mgmt",
+                                                   ovs_rundir, br->name));
+    } else if (want_listeners.n == 1
+               && !strcmp(want_listeners.names[0], "none")) {
+        svec_clear(&want_listeners);
+    }
+    svec_sort_unique(&want_listeners);
+
+    svec_init(&cur_listeners);
+    ofproto_get_listeners(br->ofproto, &cur_listeners);
+    svec_sort_unique(&cur_listeners);
+
+    if (!svec_equal(&want_listeners, &cur_listeners)) {
+        ofproto_set_listeners(br->ofproto, &want_listeners);
+    }
+    svec_destroy(&want_listeners);
+    svec_destroy(&cur_listeners);
+
+    /* OpenFlow controller connection snooping, with the same defaulting and
+     * "none" handling as for the listeners. */
+    svec_init(&want_snoops);
+    cfg_get_all_strings(&want_snoops, "bridge.%s.openflow.snoops", br->name);
+    if (!want_snoops.n) {
+        svec_add_nocopy(&want_snoops, xasprintf("punix:%s/%s.snoop",
+                                                ovs_rundir, br->name));
+    } else if (want_snoops.n == 1 && !strcmp(want_snoops.names[0], "none")) {
+        svec_clear(&want_snoops);
+    }
+    svec_sort_unique(&want_snoops);
+
+    svec_init(&cur_snoops);
+    ofproto_get_snoops(br->ofproto, &cur_snoops);
+    svec_sort_unique(&cur_snoops);
+
+    if (!svec_equal(&want_snoops, &cur_snoops)) {
+        ofproto_set_snoops(br->ofproto, &want_snoops);
+    }
+    svec_destroy(&want_snoops);
+    svec_destroy(&cur_snoops);
+
+    mirror_reconfigure(br);
+}
+
+/* Applies the "bridge.<name>.controller" configuration to 'br''s ofproto.
+ *
+ * Flushes all flows when the bridge gains or loses a controller, stores a copy
+ * of the new controller string in 'br->controller', and then either:
+ *
+ * - with a controller: configures discovery or in-band operation (including
+ * optional IP address, netmask and gateway on the bridge's own network
+ * device), fail mode, inactivity probe, maximum backoff, rate limits, STP
+ * and remote command execution; or
+ *
+ * - without a controller: installs one flow that matches every packet and
+ * outputs it to OFPP_NORMAL, and sets fixed connection parameters.
+ *
+ * Finally hands the controller string to ofproto_set_controller(). */
+static void
+bridge_reconfigure_controller(struct bridge *br)
+{
+ char *pfx = xasprintf("bridge.%s.controller", br->name);
+ const char *controller;
+
+ controller = bridge_get_controller(br);
+ /* Flush only on a transition between "has a controller" and "has none". */
+ if ((br->controller != NULL) != (controller != NULL)) {
+ ofproto_flush_flows(br->ofproto);
+ }
+ free(br->controller);
+ br->controller = controller ? xstrdup(controller) : NULL;
+
+ if (controller) {
+ const char *fail_mode;
+ int max_backoff, probe;
+ int rate_limit, burst_limit;
+
+ if (!strcmp(controller, "discover")) {
+ ofproto_set_discovery(br->ofproto, true,
+ cfg_get_string(0, "%s.accept-regex", pfx),
+ cfg_get_bool(0, "%s.update-resolv.conf",
+ pfx));
+ } else {
+ struct netdev *netdev;
+ bool in_band;
+ int error;
+
+ /* 'in_band' is true unless cfg_is_valid() accepts the "in-band" key
+ * and cfg_get_bool() then returns false for it. */
+ in_band = (!cfg_is_valid(CFG_BOOL | CFG_REQUIRED,
+ "%s.in-band", pfx)
+ || cfg_get_bool(0, "%s.in-band", pfx));
+ ofproto_set_discovery(br->ofproto, false, NULL, NULL);
+ ofproto_set_in_band(br->ofproto, in_band);
+
+ /* Optionally assign an IP configuration to the network device that
+ * carries the bridge's own name. A failure to open it is silently
+ * ignored. */
+ error = netdev_open(br->name, NETDEV_ETH_TYPE_NONE, &netdev);
+ if (!error) {
+ if (cfg_is_valid(CFG_IP | CFG_REQUIRED, "%s.ip", pfx)) {
+ struct in_addr ip, mask, gateway;
+ ip.s_addr = cfg_get_ip(0, "%s.ip", pfx);
+ mask.s_addr = cfg_get_ip(0, "%s.netmask", pfx);
+ gateway.s_addr = cfg_get_ip(0, "%s.gateway", pfx);
+
+ netdev_turn_flags_on(netdev, NETDEV_UP, true);
+ /* A zero netmask is replaced by one derived from the address. */
+ if (!mask.s_addr) {
+ mask.s_addr = guess_netmask(ip.s_addr);
+ }
+ if (!netdev_set_in4(netdev, ip, mask)) {
+ VLOG_INFO("bridge %s: configured IP address "IP_FMT", "
+ "netmask "IP_FMT,
+ br->name, IP_ARGS(&ip.s_addr),
+ IP_ARGS(&mask.s_addr));
+ }
+
+ if (gateway.s_addr) {
+ if (!netdev_add_router(gateway)) {
+ VLOG_INFO("bridge %s: configured gateway "IP_FMT,
+ br->name, IP_ARGS(&gateway.s_addr));
+ }
+ }
+ }
+ netdev_close(netdev);
+ }
+ }
+
+ /* The per-bridge fail mode wins; otherwise fall back to the "mgmt" key.
+ * The flag passed to ofproto_set_failure() is true when no fail mode is
+ * set at all or when it is "standalone" or "open". */
+ fail_mode = cfg_get_string(0, "%s.fail-mode", pfx);
+ if (!fail_mode) {
+ fail_mode = cfg_get_string(0, "mgmt.fail-mode");
+ }
+ ofproto_set_failure(br->ofproto,
+ (!fail_mode
+ || !strcmp(fail_mode, "standalone")
+ || !strcmp(fail_mode, "open")));
+
+ /* A zero per-bridge value means "use the mgmt.* value" for each of the
+ * integer settings below. */
+ probe = cfg_get_int(0, "%s.inactivity-probe", pfx);
+ ofproto_set_probe_interval(br->ofproto,
+ probe ? probe : cfg_get_int(0, "mgmt.inactivity-probe"));
+
+ max_backoff = cfg_get_int(0, "%s.max-backoff", pfx);
+ if (!max_backoff) {
+ max_backoff = cfg_get_int(0, "mgmt.max-backoff");
+ if (!max_backoff) {
+ /* Neither level configured a backoff: use 15. */
+ max_backoff = 15;
+ }
+ }
+ ofproto_set_max_backoff(br->ofproto, max_backoff);
+
+ rate_limit = cfg_get_int(0, "%s.rate-limit", pfx);
+ if (!rate_limit) {
+ rate_limit = cfg_get_int(0, "mgmt.rate-limit");
+ }
+ burst_limit = cfg_get_int(0, "%s.burst-limit", pfx);
+ if (!burst_limit) {
+ burst_limit = cfg_get_int(0, "mgmt.burst-limit");
+ }
+ ofproto_set_rate_limit(br->ofproto, rate_limit, burst_limit);
+
+ ofproto_set_stp(br->ofproto, cfg_get_bool(0, "%s.stp", pfx));
+
+ /* Remote command execution: all "commands.acl" values are joined with
+ * commas into a single string; without that key it is cleared. */
+ if (cfg_has("%s.commands.acl", pfx)) {
+ struct svec command_acls;
+ char *command_acl;
+
+ svec_init(&command_acls);
+ cfg_get_all_strings(&command_acls, "%s.commands.acl", pfx);
+ command_acl = svec_join(&command_acls, ",", "");
+
+ ofproto_set_remote_execution(br->ofproto, command_acl,
+ cfg_get_string(0, "%s.commands.dir",
+ pfx));
+
+ svec_destroy(&command_acls);
+ free(command_acl);
+ } else {
+ ofproto_set_remote_execution(br->ofproto, NULL, NULL);
+ }
+ } else {
+ union ofp_action action;
+ flow_t flow;
+
+ /* Set up a flow that matches every packet and directs them to
+ * OFPP_NORMAL (which goes to us). */
+ memset(&action, 0, sizeof action);
+ action.type = htons(OFPAT_OUTPUT);
+ action.output.len = htons(sizeof action);
+ action.output.port = htons(OFPP_NORMAL);
+ memset(&flow, 0, sizeof flow);
+ ofproto_add_flow(br->ofproto, &flow, OFPFW_ALL, 0,
+ &action, 1, 0);
+
+ /* Fixed settings used when no controller is configured. */
+ ofproto_set_in_band(br->ofproto, false);
+ ofproto_set_max_backoff(br->ofproto, 1);
+ ofproto_set_probe_interval(br->ofproto, 5);
+ ofproto_set_failure(br->ofproto, false);
+ ofproto_set_stp(br->ofproto, false);
+ }
+ free(pfx);
+
+ ofproto_set_controller(br->ofproto, br->controller);
+}
+
+/* Initializes 'ifaces' and fills it with the name of every interface on every
+ * port of 'br', then sorts it.  Asserts that no name occurs more than once. */
+static void
+bridge_get_all_ifaces(const struct bridge *br, struct svec *ifaces)
+{
+    size_t port_no;
+
+    svec_init(ifaces);
+    for (port_no = 0; port_no < br->n_ports; port_no++) {
+        const struct port *p = br->ports[port_no];
+        size_t k = 0;
+
+        while (k < p->n_ifaces) {
+            svec_add(ifaces, p->ifaces[k]->name);
+            k++;
+        }
+    }
+    svec_sort(ifaces);
+    assert(svec_is_unique(ifaces));
+}
+
+/* Re-reads the datapath port number of every interface on 'br', in case the
+ * administrator rearranged datapath ports behind our back.
+ *
+ * Any interface that the datapath no longer reports is left with a 'dp_ifidx'
+ * of -1.  Call this only from a context in which such interfaces are about to
+ * be removed from the bridge: a -1 'dp_ifidx' causes trouble later when it is
+ * sent to the datapath, which doesn't support UINT16_MAX+1 ports. */
+static void
+bridge_fetch_dp_ifaces(struct bridge *br)
+{
+    struct odp_port *listed;
+    size_t n_listed;
+    size_t port_no, k;
+
+    /* Forget every datapath index that we currently know. */
+    for (port_no = 0; port_no < br->n_ports; port_no++) {
+        struct port *port = br->ports[port_no];
+
+        for (k = 0; k < port->n_ifaces; k++) {
+            port->ifaces[k]->dp_ifidx = -1;
+        }
+    }
+    port_array_clear(&br->ifaces);
+
+    /* Rebuild the mapping from what the datapath reports now. */
+    dpif_port_list(&br->dpif, &listed, &n_listed);
+    for (k = 0; k < n_listed; k++) {
+        struct odp_port *odp = &listed[k];
+        struct iface *iface = iface_lookup(br, odp->devname);
+
+        if (!iface) {
+            /* Not one of our interfaces. */
+            continue;
+        }
+
+        if (iface->dp_ifidx >= 0) {
+            /* Same device name seen earlier in this listing. */
+            VLOG_WARN("dp%u reported interface %s twice",
+                      dpif_id(&br->dpif), odp->devname);
+            continue;
+        }
+
+        if (iface_from_dp_ifidx(br, odp->port)) {
+            /* Same port number already claimed by another interface. */
+            VLOG_WARN("dp%u reported interface %"PRIu16" twice",
+                      dpif_id(&br->dpif), odp->port);
+            continue;
+        }
+
+        port_array_set(&br->ifaces, odp->port, iface);
+        iface->dp_ifidx = odp->port;
+    }
+    free(listed);
+}
+
+/* Bridge packet processing functions. */
+
+/* Returns the entry of 'port''s bond hash table that 'mac' hashes to. */
+static struct bond_entry *
+lookup_bond_entry(const struct port *port, const uint8_t mac[ETH_ADDR_LEN])
+{
+    size_t hash = hash_bytes(mac, ETH_ADDR_LEN, 0);
+    size_t slot = hash & BOND_MASK;
+
+    return &port->bond_hash[slot];
+}
+
+/* Returns the index within 'port->ifaces' of the first enabled interface, or
+ * -1 if none of 'port''s interfaces is enabled. */
+static int
+bond_choose_iface(const struct port *port)
+{
+    size_t idx = 0;
+
+    while (idx < port->n_ifaces) {
+        if (port->ifaces[idx]->enabled) {
+            return idx;
+        }
+        idx++;
+    }
+    return -1;
+}
+
+/* Picks the interface of 'port' on which to send packets in 'flow'.
+ *
+ * On success stores the interface's datapath index in '*dp_ifidx', ORs the
+ * relevant tags into '*tags' and returns true.  Returns false, after ORing in
+ * 'port->no_ifaces_tag', if 'port' is bonded and has no enabled interface. */
+static bool
+choose_output_iface(const struct port *port, const flow_t *flow,
+                    uint16_t *dp_ifidx, tag_type *tags)
+{
+    struct iface *chosen;
+
+    assert(port->n_ifaces);
+    if (port->n_ifaces != 1) {
+        /* Bonded port: the source MAC's hash entry selects the slave. */
+        struct bond_entry *entry = lookup_bond_entry(port, flow->dl_src);
+        bool usable = (entry->iface_idx >= 0
+                       && entry->iface_idx < port->n_ifaces
+                       && port->ifaces[entry->iface_idx]->enabled);
+
+        if (!usable) {
+            /* XXX select interface properly.  The current interface selection
+             * is only good for testing the rebalancing code. */
+            entry->iface_idx = bond_choose_iface(port);
+            if (entry->iface_idx < 0) {
+                *tags |= port->no_ifaces_tag;
+                return false;
+            }
+            entry->iface_tag = tag_create_random();
+        }
+        *tags |= entry->iface_tag;
+        chosen = port->ifaces[entry->iface_idx];
+    } else {
+        chosen = port->ifaces[0];
+    }
+
+    *dp_ifidx = chosen->dp_ifidx;
+    *tags |= chosen->tag;       /* Currently only used for bonding. */
+    return true;
+}
+
+/* Reacts to a carrier report for bonded interface 'iface'.
+ *
+ * 'iface->delay_expires' is LLONG_MAX while no enable/disable change is
+ * pending.  If 'carrier' disagrees with 'iface->enabled', a change is scheduled
+ * after the port's updelay or downdelay; if it agrees while a change is
+ * pending, the pending change is cancelled. */
+static void
+bond_link_status_update(struct iface *iface, bool carrier)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+    struct port *port = iface->port;
+    bool in_sync = carrier == iface->enabled;
+    bool nothing_pending = iface->delay_expires == LLONG_MAX;
+    int delay;
+
+    if (in_sync == nothing_pending) {
+        /* Either already in sync with nothing pending, or out of sync with
+         * the change already scheduled: nothing to do. */
+        return;
+    }
+
+    VLOG_INFO_RL(&rl, "interface %s: carrier %s",
+                 iface->name, carrier ? "detected" : "dropped");
+
+    if (in_sync) {
+        /* Carrier went back to the current state: cancel the pending flip. */
+        iface->delay_expires = LLONG_MAX;
+        VLOG_INFO_RL(&rl, "interface %s: will not be %s",
+                     iface->name, carrier ? "disabled" : "enabled");
+        return;
+    }
+
+    delay = carrier ? port->updelay : port->downdelay;
+    iface->delay_expires = time_msec() + delay;
+    if (delay) {
+        VLOG_INFO_RL(&rl,
+                     "interface %s: will be %s if it stays %s for %d ms",
+                     iface->name,
+                     carrier ? "enabled" : "disabled",
+                     carrier ? "up" : "down",
+                     delay);
+    }
+}
+
+/* Selects a new active interface for bonded 'port' (or -1 if none of its
+ * interfaces is enabled), assigns the port a fresh 'active_iface_tag', and
+ * logs the outcome. */
+static void
+bond_choose_active_iface(struct port *port)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+
+    port->active_iface = bond_choose_iface(port);
+    port->active_iface_tag = tag_create_random();
+
+    if (port->active_iface < 0) {
+        VLOG_WARN_RL(&rl, "port %s: all ports disabled, no active interface",
+                     port->name);
+    } else {
+        VLOG_INFO_RL(&rl, "port %s: active interface is now %s",
+                     port->name, port->ifaces[port->active_iface]->name);
+    }
+}
+
+/* Carries out the enable/disable changes whose delay has expired on the
+ * bonded ports (those with at least two interfaces) of 'br', revalidating
+ * the affected flows and re-choosing the active interface where needed. */
+static void
+bond_run(struct bridge *br)
+{
+    size_t port_no, k;
+
+    for (port_no = 0; port_no < br->n_ports; port_no++) {
+        struct port *port = br->ports[port_no];
+
+        if (port->n_ifaces < 2) {
+            /* Not a bond. */
+            continue;
+        }
+
+        for (k = 0; k < port->n_ifaces; k++) {
+            struct iface *iface = port->ifaces[k];
+
+            if (time_msec() < iface->delay_expires) {
+                continue;
+            }
+
+            /* The delay has run out: flip the interface's state. */
+            iface->delay_expires = LLONG_MAX;
+            iface->enabled = !iface->enabled;
+            VLOG_WARN("interface %s: %s",
+                      iface->name,
+                      iface->enabled ? "enabled" : "disabled");
+
+            if (iface->enabled) {
+                if (port->active_iface < 0) {
+                    /* The bond had no usable slave until now. */
+                    ofproto_revalidate(br->ofproto, port->no_ifaces_tag);
+                    bond_choose_active_iface(port);
+                }
+                iface->tag = tag_create_random();
+            } else {
+                ofproto_revalidate(br->ofproto, iface->tag);
+                if (iface->port_ifidx == port->active_iface) {
+                    ofproto_revalidate(br->ofproto,
+                                       port->active_iface_tag);
+                    bond_choose_active_iface(port);
+                }
+            }
+        }
+    }
+}
+
+/* Registers a poll timer for every pending enable/disable change on the
+ * bonded ports of 'br', so that the poll loop wakes up when one is due. */
+static void
+bond_wait(struct bridge *br)
+{
+    size_t port_no, k;
+
+    for (port_no = 0; port_no < br->n_ports; port_no++) {
+        struct port *port = br->ports[port_no];
+
+        if (port->n_ifaces < 2) {
+            continue;
+        }
+        for (k = 0; k < port->n_ifaces; k++) {
+            struct iface *iface = port->ifaces[k];
+
+            if (iface->delay_expires == LLONG_MAX) {
+                /* Nothing pending for this interface. */
+                continue;
+            }
+            poll_timer_wait(iface->delay_expires - time_msec());
+        }
+    }
+}
+
+/* Fills in '*p' as the destination for sending 'flow', received on 'in_port',
+ * out of 'out_port'.  Returns true if '*p' was filled in, false if
+ * 'out_port''s STP state forbids forwarding or no interface could be chosen.
+ * ORs the tags that the decision depends on into '*tags'. */
+static bool
+set_dst(struct dst *p, const flow_t *flow,
+        const struct port *in_port, const struct port *out_port,
+        tag_type *tags)
+{
+    /* STP handling.
+     *
+     * XXX This uses too many tags: any broadcast flow will get one tag per
+     * destination port, and thus a broadcast on a switch of any size is likely
+     * to have all tag bits set.  We should figure out a way to be smarter.
+     *
+     * This is OK when STP is disabled, because stp_state_tag is 0 then. */
+    *tags |= out_port->stp_state_tag;
+    if (!(out_port->stp_state & (STP_DISABLED | STP_FORWARDING))) {
+        return false;
+    }
+
+    if (out_port->vlan >= 0) {
+        /* Output port has an implicit VLAN: send without a VLAN tag. */
+        p->vlan = OFP_VLAN_NONE;
+    } else if (in_port->vlan >= 0) {
+        /* Trunk output, implicit-VLAN input: use the input port's VLAN. */
+        p->vlan = in_port->vlan;
+    } else {
+        /* Trunk to trunk: keep the VLAN the flow arrived with. */
+        p->vlan = ntohs(flow->dl_vlan);
+    }
+    return choose_output_iface(out_port, flow, &p->dp_ifidx, tags);
+}
+
+/* Exchanges the contents of '*p' and '*q'. */
+static void
+swap_dst(struct dst *p, struct dst *q)
+{
+    struct dst saved = *q;
+
+    *q = *p;
+    *p = saved;
+}
+
+/* Rearranges the 'n_dsts' elements of 'dsts' so that every element whose vlan
+ * equals 'vlan' precedes every element whose vlan differs.  (This may help
+ * performance by reducing the number of VLAN changes that we push to the
+ * datapath.  A full sort by vlan would also work, but in most cases there are
+ * at most two different vlan tags, so that is possibly overkill.) */
+static void
+partition_dsts(struct dst *dsts, size_t n_dsts, int vlan)
+{
+    size_t lo = 0;              /* Elements below 'lo' have vlan == 'vlan'. */
+    size_t hi = n_dsts;         /* Elements at or above 'hi' have another. */
+
+    while (lo != hi) {
+        /* Skip over leading elements that are already in place. */
+        while (dsts[lo].vlan == vlan) {
+            lo++;
+            if (lo == hi) {
+                return;
+            }
+        }
+
+        /* Here dsts[lo] is misplaced.  Skip over trailing elements that are
+         * already in place. */
+        while (dsts[hi - 1].vlan != vlan) {
+            hi--;
+            if (hi == lo) {
+                return;
+            }
+        }
+
+        /* Both dsts[lo] and dsts[hi - 1] are misplaced: exchange them. */
+        swap_dst(&dsts[lo], &dsts[hi - 1]);
+        lo++;
+        hi--;
+    }
+}
+
+/* Returns the 1-based position of the least significant 1-bit in 'mask', or 0
+ * if 'mask' is zero, as ffs() does.  The build assertion checks that a mirror
+ * mask is no wider than unsigned int. */
+static int
+mirror_mask_ffs(mirror_mask_t mask)
+{
+    BUILD_ASSERT_DECL(sizeof(unsigned int) >= sizeof(mask));
+    int lowest_set = ffs(mask);
+
+    return lowest_set;
+}
+
+/* Returns true if any of the 'n_dsts' elements of 'dsts' has the same vlan and
+ * datapath port as '*test', false otherwise. */
+static bool
+dst_is_duplicate(const struct dst *dsts, size_t n_dsts,
+                 const struct dst *test)
+{
+    const struct dst *end = dsts + n_dsts;
+    const struct dst *cur;
+
+    for (cur = dsts; cur != end; cur++) {
+        if (cur->vlan == test->vlan && cur->dp_ifidx == test->dp_ifidx) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Returns true if 'port' has no implicit VLAN (its 'vlan' is negative) and
+ * 'vlan' is set in its 'trunks' bitmap. */
+static bool
+port_trunks_vlan(const struct port *port, uint16_t vlan)
+{
+    if (port->vlan >= 0) {
+        return false;
+    }
+    return bitmap_is_set(port->trunks, vlan);
+}
+
+/* Returns true if 'vlan' is 'port''s implicit VLAN or 'port' trunks 'vlan'. */
+static bool
+port_includes_vlan(const struct port *port, uint16_t vlan)
+{
+    if (vlan == port->vlan) {
+        return true;
+    }
+    return port_trunks_vlan(port, vlan);
+}
+
+/* Fills 'dsts' with the destinations (datapath port plus VLAN) for a packet in
+ * 'flow' that arrived on 'in_port' in VLAN 'vlan', and returns the number of
+ * elements stored.
+ *
+ * 'out_port' is a specific port, FLOOD_PORT to send to every other port that
+ * includes 'vlan' and is not a mirror output port, or NULL for no regular
+ * output. Destinations for the mirrors selected by 'in_port' and by the ports
+ * actually output to are appended afterward, skipping duplicates. Tags that
+ * the result depends on are ORed into '*tags'. Finally the array is
+ * partitioned so that entries whose VLAN equals the flow's own dl_vlan come
+ * first. */
+static size_t
+compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan,
+ const struct port *in_port, const struct port *out_port,
+ struct dst dsts[], tag_type *tags)
+{
+ mirror_mask_t mirrors = in_port->src_mirrors;
+ struct dst *dst = dsts;
+ size_t i;
+
+ *tags |= in_port->stp_state_tag;
+ if (out_port == FLOOD_PORT) {
+ /* XXX use ODP_FLOOD if no vlans or bonding. */
+ /* XXX even better, define each VLAN as a datapath port group */
+ for (i = 0; i < br->n_ports; i++) {
+ struct port *port = br->ports[i];
+ if (port != in_port && port_includes_vlan(port, vlan)
+ && !port->is_mirror_output_port
+ && set_dst(dst, flow, in_port, port, tags)) {
+ mirrors |= port->dst_mirrors;
+ dst++;
+ }
+ }
+ } else if (out_port && set_dst(dst, flow, in_port, out_port, tags)) {
+ mirrors |= out_port->dst_mirrors;
+ dst++;
+ }
+
+ /* Handle each selected mirror in turn, lowest-numbered bit first. '*dst'
+ * is used as scratch space: set_dst() writes into it, and it is kept (by
+ * advancing 'dst') only if it turns out to be wanted. */
+ while (mirrors) {
+ struct mirror *m = br->mirrors[mirror_mask_ffs(mirrors) - 1];
+ if (!m->n_vlans || vlan_is_mirrored(m, vlan)) {
+ if (m->out_port) {
+ if (set_dst(dst, flow, in_port, m->out_port, tags)
+ && !dst_is_duplicate(dsts, dst - dsts, dst)) {
+ dst++;
+ }
+ } else {
+ /* No output port: mirror to every port that includes the
+ * mirror's output VLAN. */
+ for (i = 0; i < br->n_ports; i++) {
+ struct port *port = br->ports[i];
+ if (port_includes_vlan(port, m->out_vlan)
+ && set_dst(dst, flow, in_port, port, tags)
+ && !dst_is_duplicate(dsts, dst - dsts, dst))
+ {
+ if (port->vlan < 0) {
+ dst->vlan = m->out_vlan;
+ }
+ if (dst->dp_ifidx == flow->in_port
+ && dst->vlan == vlan) {
+ /* Don't send out input port on same VLAN. */
+ continue;
+ }
+ dst++;
+ }
+ }
+ }
+ }
+ /* Clear the lowest set bit, i.e. the mirror just handled. */
+ mirrors &= mirrors - 1;
+ }
+
+ partition_dsts(dsts, dst - dsts, ntohs(flow->dl_vlan));
+ return dst - dsts;
+}
+
+/* Debugging aid: prints the 'n' destinations in 'dsts' to stdout, each as
+ * ">p<port>" followed by "v<vlan>" unless its vlan is OFP_VLAN_NONE. */
+static void UNUSED
+print_dsts(const struct dst *dsts, size_t n)
+{
+    size_t i;
+
+    for (i = 0; i < n; i++) {
+        const struct dst *d = &dsts[i];
+
+        printf(">p%"PRIu16, d->dp_ifidx);
+        if (d->vlan != OFP_VLAN_NONE) {
+            printf("v%"PRIu16, d->vlan);
+        }
+    }
+}
+
+/* Appends to 'actions' the datapath actions that deliver 'flow' to the
+ * destinations computed by compose_dsts(): one output action per destination,
+ * preceded by a VLAN strip or set action whenever the destination's VLAN
+ * differs from the VLAN currently on the packet. */
+static void
+compose_actions(struct bridge *br, const flow_t *flow, uint16_t vlan,
+                const struct port *in_port, const struct port *out_port,
+                tag_type *tags, struct odp_actions *actions)
+{
+    struct dst dsts[DP_MAX_PORTS * (MAX_MIRRORS + 1)];
+    uint16_t cur_vlan;
+    size_t n_dsts;
+    size_t i;
+
+    n_dsts = compose_dsts(br, flow, vlan, in_port, out_port, dsts, tags);
+
+    /* The packet starts out carrying the VLAN that it arrived with. */
+    cur_vlan = ntohs(flow->dl_vlan);
+    for (i = 0; i < n_dsts; i++) {
+        const struct dst *d = &dsts[i];
+        union odp_action *act;
+
+        if (d->vlan != cur_vlan) {
+            if (d->vlan != OFP_VLAN_NONE) {
+                act = odp_actions_add(actions, ODPAT_SET_VLAN_VID);
+                act->vlan_vid.vlan_vid = htons(d->vlan);
+            } else {
+                odp_actions_add(actions, ODPAT_STRIP_VLAN);
+            }
+            cur_vlan = d->vlan;
+        }
+
+        act = odp_actions_add(actions, ODPAT_OUTPUT);
+        act->output.port = d->dp_ifidx;
+    }
+}
+
+/* Returns true if 'flow' is an ARP flow addressed to the Ethernet broadcast
+ * address, 'packet' holds at least sizeof(struct arp_eth_header) bytes, and
+ * the 'ar_op' field read from the start of 'packet''s data equals
+ * ARP_OP_REQUEST.
+ *
+ * NOTE(review): despite the function's name the opcode tested is
+ * ARP_OP_REQUEST, and 'ar_op' is compared without a byte-order conversion
+ * whereas 'dl_type' is compared against an htons() value -- confirm that both
+ * are intended. */
+static bool
+is_bcast_arp_reply(const flow_t *flow, const struct ofpbuf *packet)
+{
+    struct arp_eth_header *arp = (struct arp_eth_header *) packet->data;
+
+    if (flow->dl_type != htons(ETH_TYPE_ARP)) {
+        return false;
+    }
+    if (!eth_addr_is_broadcast(flow->dl_dst)) {
+        return false;
+    }
+    if (packet->size < sizeof(struct arp_eth_header)) {
+        /* Too short to hold the header, so 'arp' must not be read. */
+        return false;
+    }
+    return arp->ar_op == ARP_OP_REQUEST;
+}
+
+/* Decides how 'br' should handle 'flow' and appends the resulting datapath
+ * actions to 'actions'. 'packet' is the packet that triggered the decision,
+ * or NULL when no packet is available (the code below treats that case as a
+ * revalidation: it neither learns from it nor logs most per-packet warnings).
+ * Tags that the decision depends on are ORed into '*tags'.
+ *
+ * If the composed actions may be applied to any packet in the given 'flow',
+ * returns true. Otherwise, the actions should only be applied to 'packet', or
+ * not at all, if 'packet' was NULL. */
+static bool
+process_flow(struct bridge *br, const flow_t *flow,
+ const struct ofpbuf *packet, struct odp_actions *actions,
+ tag_type *tags)
+{
+ struct iface *in_iface;
+ struct port *in_port;
+ struct port *out_port = NULL; /* By default, drop the packet/flow. */
+ int vlan;
+
+ /* Find the interface and port structure for the received packet. */
+ in_iface = iface_from_dp_ifidx(br, flow->in_port);
+ if (!in_iface) {
+ /* No interface? Something fishy... */
+ if (packet != NULL) {
+ /* Odd. A few possible reasons here:
+ *
+ * - We deleted an interface but there are still a few packets
+ * queued up from it.
+ *
+ * - Someone externally added an interface (e.g. with "ovs-dpctl
+ * add-if") that we don't know about.
+ *
+ * - Packet arrived on the local port but the local port is not
+ * one of our bridge ports.
+ */
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+ VLOG_WARN_RL(&rl, "bridge %s: received packet on unknown "
+ "interface %"PRIu16, br->name, flow->in_port);
+ }
+
+ /* Return without adding any actions, to drop packets on this flow. */
+ return true;
+ }
+ in_port = in_iface->port;
+
+ /* Figure out what VLAN this packet belongs to.
+ *
+ * Note that dl_vlan of 0 and of OFP_VLAN_NONE both mean that the packet
+ * belongs to VLAN 0, so we should treat both cases identically. (In the
+ * former case, the packet has an 802.1Q header that specifies VLAN 0,
+ * presumably to allow a priority to be specified. In the latter case, the
+ * packet does not have any 802.1Q header.) */
+ vlan = ntohs(flow->dl_vlan);
+ if (vlan == OFP_VLAN_NONE) {
+ vlan = 0;
+ }
+ if (in_port->vlan >= 0) {
+ /* Input port has an implicit VLAN: tagged packets are dropped and
+ * untagged ones are assigned the port's VLAN. */
+ if (vlan) {
+ /* XXX support double tagging? */
+ if (packet != NULL) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+ VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %"PRIu16" tagged "
+ "packet received on port %s configured with "
+ "implicit VLAN %"PRIu16,
+ br->name, ntohs(flow->dl_vlan),
+ in_port->name, in_port->vlan);
+ }
+ goto done;
+ }
+ vlan = in_port->vlan;
+ } else {
+ /* Trunk port: the packet's VLAN must be one that the port trunks.
+ * Unlike the branch above, this warning is emitted even when 'packet'
+ * is NULL. */
+ if (!port_includes_vlan(in_port, vlan)) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+ VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %d tagged "
+ "packet received on port %s not configured for "
+ "trunking VLAN %d",
+ br->name, vlan, in_port->name, vlan);
+ goto done;
+ }
+ }
+
+ /* Drop frames for ports that STP wants entirely killed (both for
+ * forwarding and for learning). Later, after we do learning, we'll drop
+ * the frames that STP wants to do learning but not forwarding on. */
+ if (in_port->stp_state & (STP_LISTENING | STP_BLOCKING)) {
+ goto done;
+ }
+
+ /* Drop frames for reserved multicast addresses. */
+ if (eth_addr_is_reserved(flow->dl_dst)) {
+ goto done;
+ }
+
+ /* Drop frames on ports reserved for mirroring. */
+ if (in_port->is_mirror_output_port) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+ VLOG_WARN_RL(&rl, "bridge %s: dropping packet received on port %s, "
+ "which is reserved exclusively for mirroring",
+ br->name, in_port->name);
+ goto done;
+ }
+
+ /* Drop multicast and broadcast packets on inactive bonded interfaces, to
+ * avoid receiving duplicates. */
+ if (in_port->n_ifaces > 1 && eth_addr_is_multicast(flow->dl_dst)) {
+ *tags |= in_port->active_iface_tag;
+ if (in_port->active_iface != in_iface->port_ifidx) {
+ goto done;
+ }
+ }
+
+ /* MAC learning. */
+ /* Flood unless the learning table below yields a specific port. */
+ out_port = FLOOD_PORT;
+ if (br->ml) {
+ int out_port_idx;
+ bool may_learn;
+
+ if (!packet) {
+ /* Don't try to learn from revalidation. */
+ may_learn = false;
+ } else if (in_port->n_ifaces > 1) {
+ /* If the packet arrived on a bonded port, don't learn from it
+ * unless we haven't learned any port at all for that address
+ * (because we probably sent the packet on one bonded interface and
+ * got it back on the other). Broadcast ARP replies are an
+ * exception to this rule: the host has moved to another switch. */
+ int src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan);
+ may_learn = (src_idx < 0
+ || src_idx == in_port->port_idx
+ || is_bcast_arp_reply(flow, packet));
+ } else {
+ may_learn = true;
+ }
+
+ /* Learn source MAC. */
+ if (may_learn) {
+ tag_type rev_tag = mac_learning_learn(br->ml, flow->dl_src,
+ vlan, in_port->port_idx);
+ if (rev_tag) {
+ /* The log messages here could actually be useful in debugging,
+ * so keep the rate limit relatively high. */
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30,
+ 300);
+ VLOG_DBG_RL(&rl, "bridge %s: learned that "ETH_ADDR_FMT" is "
+ "on port %s in VLAN %d",
+ br->name, ETH_ADDR_ARGS(flow->dl_src),
+ in_port->name, vlan);
+ ofproto_revalidate(br->ofproto, rev_tag);
+ }
+ }
+
+ /* Determine output port. */
+ out_port_idx = mac_learning_lookup_tag(br->ml, flow->dl_dst, vlan,
+ tags);
+ if (out_port_idx >= 0 && out_port_idx < br->n_ports) {
+ out_port = br->ports[out_port_idx];
+ }
+ }
+
+ /* Don't send packets out their input ports. Don't forward frames that STP
+ * wants us to discard. */
+ if (in_port == out_port || in_port->stp_state == STP_LEARNING) {
+ out_port = NULL;
+ }
+
+done:
+ /* Every drop path above arrives here with 'out_port' still NULL.
+ * compose_actions() is called regardless, so destinations that come from
+ * 'in_port''s source mirrors can still be added. */
+ compose_actions(br, flow, vlan, in_port, out_port, tags, actions);
+
+ /*
+ * We send out only a single packet, instead of setting up a flow, if the
+ * packet is an ARP directed to broadcast that arrived on a bonded
+ * interface. In such a situation ARP requests and replies must be handled
+ * differently, but OpenFlow unfortunately can't distinguish them.
+ */
+ return (in_port->n_ifaces < 2
+ || flow->dl_type != htons(ETH_TYPE_ARP)
+ || !eth_addr_is_broadcast(flow->dl_dst));
+}
+
+/* Callback for a change to a datapath port; 'br_' is the 'struct bridge'.
+ *
+ * If the port was deleted, destroys the matching interface (and its port, if
+ * that was the last interface) and flushes the bridge.  Otherwise records the
+ * new MAC address and, for bonded ports, the new link status.
+ *
+ * Careful: 'opp' is in host byte order and opp->port_no is an OFP port
+ * number. */
+static void
+bridge_port_changed_ofhook_cb(enum ofp_port_reason reason,
+                              const struct ofp_phy_port *opp,
+                              void *br_)
+{
+    struct bridge *br = br_;
+    struct iface *iface;
+    struct port *port;
+
+    iface = iface_from_dp_ifidx(br, ofp_port_to_odp_port(opp->port_no));
+    if (!iface) {
+        /* Not an interface that we manage. */
+        return;
+    }
+    port = iface->port;
+
+    if (reason != OFPPR_DELETE) {
+        memcpy(iface->mac, opp->hw_addr, ETH_ADDR_LEN);
+        if (port->n_ifaces > 1) {
+            bool up = !(opp->state & OFPPS_LINK_DOWN);
+
+            bond_link_status_update(iface, up);
+            port_update_bond_compat(port);
+        }
+        return;
+    }
+
+    VLOG_WARN("bridge %s: interface %s deleted unexpectedly",
+              br->name, iface->name);
+    iface_destroy(iface);
+    if (!port->n_ifaces) {
+        VLOG_WARN("bridge %s: port %s has no interfaces, dropping",
+                  br->name, port->name);
+        port_destroy(port);
+    }
+
+    bridge_flush(br);
+}
+
+/* Callback listed in 'bridge_ofhooks'; 'br_' is the 'struct bridge'.  Bumps
+ * the bridge_process_flow coverage counter and hands the decision to
+ * process_flow(), whose return value it passes back unchanged. */
+static bool
+bridge_normal_ofhook_cb(const flow_t *flow, const struct ofpbuf *packet,
+                        struct odp_actions *actions, tag_type *tags, void *br_)
+{
+    struct bridge *br = br_;
+    bool applies_to_whole_flow;
+
+#if 0
+    if (flow->dl_type == htons(OFP_DL_TYPE_NOT_ETH_TYPE)
+        && eth_addr_equals(flow->dl_dst, stp_eth_addr)) {
+        brstp_receive(br, flow, payload);
+        return true;
+    }
+#endif
+
+    COVERAGE_INC(bridge_process_flow);
+    applies_to_whole_flow = process_flow(br, flow, packet, actions, tags);
+    return applies_to_whole_flow;
+}
+
+/* Callback listed in 'bridge_ofhooks'; 'br_' is the 'struct bridge'.
+ *
+ * For every output action among the 'n_actions' in 'actions' that targets an
+ * interface of a bonded port, adds 'n_bytes' to the tx_bytes of the bond hash
+ * entry selected by 'flow''s source MAC.  Does nothing if the bridge has no
+ * bonded ports. */
+static void
+bridge_account_flow_ofhook_cb(const flow_t *flow,
+                              const union odp_action *actions,
+                              size_t n_actions, unsigned long long int n_bytes,
+                              void *br_)
+{
+    struct bridge *br = br_;
+    size_t i;
+
+    if (!br->has_bonded_ports) {
+        return;
+    }
+
+    for (i = 0; i < n_actions; i++) {
+        const union odp_action *act = &actions[i];
+        struct port *port;
+
+        if (act->type != ODPAT_OUTPUT) {
+            continue;
+        }
+
+        port = port_from_dp_ifidx(br, act->output.port);
+        if (port && port->n_ifaces >= 2) {
+            lookup_bond_entry(port, flow->dl_src)->tx_bytes += n_bytes;
+        }
+    }
+}
+
+/* Callback listed in 'bridge_ofhooks'; 'br_' is the 'struct bridge'.
+ *
+ * Once 'br->bond_next_rebalance' has been reached, rebalances every bonded
+ * port of the bridge and schedules the next rebalance 10000 ms later. */
+static void
+bridge_account_checkpoint_ofhook_cb(void *br_)
+{
+    struct bridge *br = br_;
+    size_t idx = 0;
+
+    /* The current ofproto implementation calls this callback at least once a
+     * second, so this timer implementation is sufficient. */
+    if (!br->has_bonded_ports || time_msec() < br->bond_next_rebalance) {
+        return;
+    }
+    br->bond_next_rebalance = time_msec() + 10000;
+
+    while (idx < br->n_ports) {
+        struct port *port = br->ports[idx];
+
+        if (port->n_ifaces > 1) {
+            bond_rebalance_port(port);
+        }
+        idx++;
+    }
+}
+
+/* This file's ofhooks callbacks, listed positionally in the member order of
+ * 'struct ofhooks'. */
+static struct ofhooks bridge_ofhooks = {
+ bridge_port_changed_ofhook_cb,
+ bridge_normal_ofhook_cb,
+ bridge_account_flow_ofhook_cb,
+ bridge_account_checkpoint_ofhook_cb,
+};
+
+/* Statistics for a single interface on a bonded port, used for load-based
+ * bond rebalancing.
+ *
+ * bond_rebalance_port() builds an array of these on its stack for the
+ * duration of one rebalancing run. */
+struct slave_balance {
+ struct iface *iface; /* The interface. */
+ uint64_t tx_bytes; /* Sum of hashes[*]->tx_bytes. */
+
+ /* All the "bond_entry"s that are assigned to this interface, in order of
+ * increasing tx_bytes. Points into an array of pointers owned by the
+ * code that builds this structure, not into separately allocated
+ * memory. */
+ struct bond_entry **hashes;
+ size_t n_hashes; /* Number of elements in 'hashes'. */
+};
+
+/* qsort() comparison function for an array of pointers to bond_entries.
+ * Orders entries in ascending order by the index of the interface to which
+ * they are assigned, and within a single interface in ascending order of bytes
+ * transmitted. */
+static int
+compare_bond_entries(const void *a_, const void *b_)
+{
+    const struct bond_entry *a = *(const struct bond_entry *const *) a_;
+    const struct bond_entry *b = *(const struct bond_entry *const *) b_;
+
+    if (a->iface_idx < b->iface_idx) {
+        return -1;
+    }
+    if (a->iface_idx > b->iface_idx) {
+        return 1;
+    }
+    if (a->tx_bytes < b->tx_bytes) {
+        return -1;
+    }
+    if (a->tx_bytes > b->tx_bytes) {
+        return 1;
+    }
+    return 0;
+}
+
+/* qsort() comparison function for an array of slave_balances.  Puts balances
+ * for enabled interfaces first; among balances with the same enabled state,
+ * sorts in *descending* order by number of bytes transmitted. */
+static int
+compare_slave_balance(const void *a_, const void *b_)
+{
+    const struct slave_balance *a = a_;
+    const struct slave_balance *b = b_;
+
+    if (a->iface->enabled == b->iface->enabled) {
+        if (a->tx_bytes == b->tx_bytes) {
+            return 0;
+        }
+        return a->tx_bytes < b->tx_bytes ? 1 : -1;
+    }
+    return a->iface->enabled ? -1 : 1;
+}
+
+/* Exchanges the contents of '*a' and '*b'. */
+static void
+swap_bals(struct slave_balance *a, struct slave_balance *b)
+{
+    struct slave_balance saved = *b;
+
+    *b = *a;
+    *a = saved;
+}
+
+/* Restores the 'n_bals' slave_balance structures in 'bals' to descending order
+ * of tx_bytes, given that 'p' (and only 'p') might be in the wrong location.
+ *
+ * This function invalidates 'p', since its contents might now be at a
+ * different position in 'bals'. */
+static void
+resort_bals(struct slave_balance *p,
+            struct slave_balance bals[], size_t n_bals)
+{
+    struct slave_balance *last;
+
+    if (n_bals <= 1) {
+        return;
+    }
+    last = bals + (n_bals - 1);
+
+    /* Bubble toward the front while heavier than the predecessor. */
+    while (p > bals && p[-1].tx_bytes < p->tx_bytes) {
+        swap_bals(p - 1, p);
+        p--;
+    }
+
+    /* Bubble toward the back while lighter than the successor. */
+    while (p < last && p[1].tx_bytes > p->tx_bytes) {
+        swap_bals(p, p + 1);
+        p++;
+    }
+}
+
+/* If debug logging is enabled, logs one line for bonded 'port' that lists,
+ * for each of the 'n_bals' elements of 'bals', the interface name, its load in
+ * kB, whether it is disabled, and the load of each bond hash assigned to it. */
+static void
+log_bals(const struct slave_balance *bals, size_t n_bals, struct port *port)
+{
+    if (VLOG_IS_DBG_ENABLED()) {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+        size_t k;
+
+        for (k = 0; k < n_bals; k++) {
+            const struct slave_balance *bal = &bals[k];
+            size_t h;
+
+            if (k > 0) {
+                ds_put_char(&ds, ',');
+            }
+            ds_put_format(&ds, " %s %"PRIu64"kB",
+                          bal->iface->name, bal->tx_bytes / 1024);
+
+            if (!bal->iface->enabled) {
+                ds_put_cstr(&ds, " (disabled)");
+            }
+
+            if (bal->n_hashes == 0) {
+                continue;
+            }
+            ds_put_cstr(&ds, " (");
+            for (h = 0; h < bal->n_hashes; h++) {
+                const struct bond_entry *entry = bal->hashes[h];
+
+                if (h > 0) {
+                    ds_put_cstr(&ds, " + ");
+                }
+                /* Hashes are identified by their index in the port's bond
+                 * hash table. */
+                ds_put_format(&ds, "h%td: %"PRIu64"kB",
+                              entry - port->bond_hash, entry->tx_bytes / 1024);
+            }
+            ds_put_cstr(&ds, ")");
+        }
+        VLOG_DBG("bond %s:%s", port->name, ds_cstr(&ds));
+        ds_destroy(&ds);
+    }
+}
+
+/* Shifts 'hash' from 'from' to 'to' within 'port'.
+ *
+ * 'hash' must be one of the 'from->n_hashes' elements of 'from->hashes'.  It
+ * is removed from that array, its tx_bytes are moved from 'from->tx_bytes' to
+ * 'to->tx_bytes', the flows tagged with its old 'iface_tag' are scheduled for
+ * revalidation, and it is reassigned to 'to''s interface with a fresh tag. */
+static void
+bond_shift_load(struct slave_balance *from, struct slave_balance *to,
+                struct bond_entry *hash)
+{
+    struct port *port = from->iface->port;
+    uint64_t delta = hash->tx_bytes;
+    size_t hash_idx;
+
+    /* Find the position of 'hash' within from->hashes.
+     *
+     * The elements of from->hashes are pointers into port->bond_hash, so the
+     * position must be found by searching: subtracting from->hashes[0] from
+     * 'hash' would yield a distance within port->bond_hash, which has nothing
+     * to do with the position in from->hashes. */
+    for (hash_idx = 0; hash_idx < from->n_hashes; hash_idx++) {
+        if (from->hashes[hash_idx] == hash) {
+            break;
+        }
+    }
+    assert(hash_idx < from->n_hashes);
+
+    VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
+              "from %s to %s (now carrying %"PRIu64"kB and "
+              "%"PRIu64"kB load, respectively)",
+              port->name, delta / 1024, hash - port->bond_hash,
+              from->iface->name, to->iface->name,
+              (from->tx_bytes - delta) / 1024,
+              (to->tx_bytes + delta) / 1024);
+
+    /* Delete element from from->hashes.
+     *
+     * We don't bother to add the element to to->hashes because not only would
+     * it require more work, its only purpose would be to allow that hash to
+     * be migrated to another slave in this rebalancing run, and there is no
+     * point in doing that. */
+    if (hash_idx == 0) {
+        /* Dropping the first element only requires advancing the pointer. */
+        from->hashes++;
+    } else {
+        memmove(from->hashes + hash_idx, from->hashes + hash_idx + 1,
+                (from->n_hashes - (hash_idx + 1)) * sizeof *from->hashes);
+    }
+    from->n_hashes--;
+
+    /* Shift load away from 'from' to 'to'. */
+    from->tx_bytes -= delta;
+    to->tx_bytes += delta;
+
+    /* Arrange for flows to be revalidated. */
+    ofproto_revalidate(port->bridge->ofproto, hash->iface_tag);
+    hash->iface_idx = to->iface->port_ifidx;
+    hash->iface_tag = tag_create_random();
+}
+
+/* Rebalances the bond hashes of bonded 'port' across its enabled interfaces
+ * according to the tx_bytes accumulated in each hash, then halves every hash's
+ * tx_bytes so that older traffic counts for less in later runs.
+ *
+ * NOTE(review): 'bals' has room for DP_MAX_PORTS elements and the code indexes
+ * bals[n_bals - 1] unconditionally, so this assumes
+ * 1 <= port->n_ifaces <= DP_MAX_PORTS -- TODO confirm against callers. */
+static void
+bond_rebalance_port(struct port *port)
+{
+ struct slave_balance bals[DP_MAX_PORTS];
+ size_t n_bals;
+ struct bond_entry *hashes[BOND_MASK + 1];
+ struct slave_balance *b, *from, *to;
+ struct bond_entry *e;
+ size_t i;
+
+ /* Sets up 'bals' to describe each of the port's interfaces, sorted in
+ * descending order of tx_bytes, so that bals[0] represents the most
+ * heavily loaded slave and bals[n_bals - 1] represents the least heavily
+ * loaded slave.
+ *
+ * The code is a bit tricky: to avoid dynamically allocating a 'hashes'
+ * array for each slave_balance structure, we sort our local array of
+ * hashes in order by slave, so that all of the hashes for a given slave
+ * become contiguous in memory, and then we point each 'hashes' members of
+ * a slave_balance structure to the start of a contiguous group. */
+ n_bals = port->n_ifaces;
+ for (b = bals; b < &bals[n_bals]; b++) {
+ b->iface = port->ifaces[b - bals];
+ b->tx_bytes = 0;
+ b->hashes = NULL;
+ b->n_hashes = 0;
+ }
+ for (i = 0; i <= BOND_MASK; i++) {
+ hashes[i] = &port->bond_hash[i];
+ }
+ qsort(hashes, BOND_MASK + 1, sizeof *hashes, compare_bond_entries);
+ /* At this point bals[k] still describes port->ifaces[k], so a hash's
+ * iface_idx indexes 'bals' directly. Hashes whose iface_idx is out of
+ * range are not attributed to any slave. */
+ for (i = 0; i <= BOND_MASK; i++) {
+ e = hashes[i];
+ if (e->iface_idx >= 0 && e->iface_idx < port->n_ifaces) {
+ b = &bals[e->iface_idx];
+ b->tx_bytes += e->tx_bytes;
+ if (!b->hashes) {
+ b->hashes = &hashes[i];
+ }
+ b->n_hashes++;
+ }
+ }
+ qsort(bals, n_bals, sizeof *bals, compare_slave_balance);
+ log_bals(bals, n_bals, port);
+
+ /* Discard slaves that aren't enabled (which were sorted to the back of the
+ * array earlier). */
+ while (!bals[n_bals - 1].iface->enabled) {
+ n_bals--;
+ if (!n_bals) {
+ /* NOTE(review): this early return also skips the tx_bytes decay at
+ * the end of the function -- confirm that is intended. */
+ return;
+ }
+ }
+
+ /* Shift load from the most-loaded slaves to the least-loaded slaves. */
+ to = &bals[n_bals - 1];
+ for (from = bals; from < to; ) {
+ uint64_t overload = from->tx_bytes - to->tx_bytes;
+ if (overload < to->tx_bytes >> 5 || overload < 100000) {
+ /* The extra load on 'from' (and all less-loaded slaves), compared
+ * to that of 'to' (the least-loaded slave), is less than ~3%, or
+ * it is less than ~1Mbps. No point in rebalancing. */
+ break;
+ } else if (from->n_hashes == 1) {
+ /* 'from' only carries a single MAC hash, so we can't shift any
+ * load away from it, even though we want to. */
+ from++;
+ } else {
+ /* 'from' is carrying significantly more load than 'to', and that
+ * load is split across at least two different hashes. Pick a hash
+ * to migrate to 'to' (the least-loaded slave), given that doing so
+ * must not cause 'to''s load to exceed 'from''s load.
+ *
+ * The sort order we use means that we prefer to shift away the
+ * smallest hashes instead of the biggest ones. There is little
+ * reason behind this decision; we could use the opposite sort
+ * order to shift away big hashes ahead of small ones. */
+ size_t i;
+
+ for (i = 0; i < from->n_hashes; i++) {
+ uint64_t delta = from->hashes[i]->tx_bytes;
+ if (to->tx_bytes + delta < from->tx_bytes - delta) {
+ break;
+ }
+ }
+ if (i < from->n_hashes) {
+ bond_shift_load(from, to, from->hashes[i]);
+
+ /* Re-sort 'bals'. Note that this may make 'from' and 'to'
+ * point to different slave_balance structures. It is only
+ * valid to do these two operations in a row at all because we
+ * know that 'from' will not move past 'to' and vice versa. */
+ resort_bals(from, bals, n_bals);
+ resort_bals(to, bals, n_bals);
+ } else {
+ /* No hash on 'from' can move without overshooting: go on to
+ * the next slave. */
+ from++;
+ }
+ }
+ }
+
+ /* Implement exponentially weighted moving average. A weight of 1/2 causes
+ * historical data to decay to <1% in 7 rebalancing runs. */
+ for (e = &port->bond_hash[0]; e <= &port->bond_hash[BOND_MASK]; e++) {
+ e->tx_bytes /= 2;
+ }
+}
+
+/* Port functions. */
+
+/* Allocates a new port called 'name' and appends it to 'br''s array of
+ * ports, growing that array first if it is already full.  The port starts
+ * out with no VLAN tag (-1), no trunk bitmap, no active interface (-1), and
+ * STP disabled.  Flushes 'br' once the port has been added. */
+static void
+port_create(struct bridge *br, const char *name)
+{
+    struct port *new_port = xcalloc(1, sizeof *new_port);
+
+    new_port->name = xstrdup(name);
+    new_port->bridge = br;
+    new_port->port_idx = br->n_ports;
+    new_port->vlan = -1;
+    new_port->trunks = NULL;
+    new_port->active_iface = -1;
+    new_port->stp_state = STP_DISABLED;
+    new_port->stp_state_tag = 0;
+
+    /* Make room for one more pointer before appending. */
+    if (br->allocated_ports <= br->n_ports) {
+        br->ports = x2nrealloc(br->ports, &br->allocated_ports,
+                               sizeof *br->ports);
+    }
+    br->ports[br->n_ports] = new_port;
+    br->n_ports++;
+
+    VLOG_INFO("created port %s on bridge %s", new_port->name, br->name);
+    bridge_flush(br);
+}
+
+/* Re-reads the configuration for 'port' and applies it: interfaces are
+ * added and removed so that they match the configured set, the bonding
+ * up/down delays are refreshed (negative values are clamped to 0), and the
+ * VLAN tag and trunk bitmap are recomputed.  The bridge is flushed whenever
+ * the VLAN tag or the trunk set changes. */
+static void
+port_reconfigure(struct port *port)
+{
+    bool bonded = cfg_has_section("bonding.%s", port->name);
+    struct svec old_ifaces, new_ifaces;
+    unsigned long *trunks;
+    int vlan;
+    size_t i;
+
+    /* Collect old and new interfaces. */
+    svec_init(&old_ifaces);
+    svec_init(&new_ifaces);
+    for (i = 0; i < port->n_ifaces; i++) {
+        svec_add(&old_ifaces, port->ifaces[i]->name);
+    }
+    svec_sort(&old_ifaces);
+    if (bonded) {
+        cfg_get_all_keys(&new_ifaces, "bonding.%s.slave", port->name);
+        if (!new_ifaces.n) {
+            VLOG_ERR("port %s: no interfaces specified for bonded port",
+                     port->name);
+        } else if (new_ifaces.n == 1) {
+            VLOG_WARN("port %s: only 1 interface specified for bonded port",
+                      port->name);
+        }
+
+        port->updelay = cfg_get_int(0, "bonding.%s.updelay", port->name);
+        if (port->updelay < 0) {
+            port->updelay = 0;
+        }
+        port->downdelay = cfg_get_int(0, "bonding.%s.downdelay", port->name);
+        if (port->downdelay < 0) {
+            port->downdelay = 0;
+        }
+    } else {
+        /* A non-bonded port has a single interface named after the port. */
+        svec_add(&new_ifaces, port->name);
+    }
+
+    /* Get rid of deleted interfaces and add new interfaces.
+     *
+     * iface_destroy() moves the port's last interface into the slot that it
+     * vacates, so 'i' must advance only when the current interface is kept.
+     * Advancing unconditionally as well would skip the interface that
+     * follows every kept one. */
+    for (i = 0; i < port->n_ifaces; ) {
+        struct iface *iface = port->ifaces[i];
+        if (!svec_contains(&new_ifaces, iface->name)) {
+            iface_destroy(iface);
+        } else {
+            i++;
+        }
+    }
+    for (i = 0; i < new_ifaces.n; i++) {
+        const char *name = new_ifaces.names[i];
+        if (!svec_contains(&old_ifaces, name)) {
+            iface_create(port, name);
+        }
+    }
+
+    /* Get VLAN tag.  -1 means "no tag", i.e. the port is a trunk port. */
+    vlan = -1;
+    if (cfg_has("vlan.%s.tag", port->name)) {
+        if (!bonded) {
+            vlan = cfg_get_vlan(0, "vlan.%s.tag", port->name);
+            if (vlan >= 0 && vlan <= 4095) {
+                VLOG_DBG("port %s: assigning VLAN tag %d", port->name, vlan);
+            }
+        } else {
+            /* It's possible that bonded, VLAN-tagged ports make sense.  Maybe
+             * they even work as-is.  But they have not been tested. */
+            VLOG_WARN("port %s: VLAN tags not supported on bonded ports",
+                      port->name);
+        }
+    }
+    if (port->vlan != vlan) {
+        port->vlan = vlan;
+        bridge_flush(port->bridge);
+    }
+
+    /* Get trunked VLANs.  Only ports without a VLAN tag carry trunks. */
+    trunks = NULL;
+    if (vlan < 0) {
+        size_t n_trunks, n_errors;
+
+        trunks = bitmap_allocate(4096);
+        n_trunks = cfg_count("vlan.%s.trunks", port->name);
+        n_errors = 0;
+        for (i = 0; i < n_trunks; i++) {
+            int trunk = cfg_get_vlan(i, "vlan.%s.trunks", port->name);
+            if (trunk >= 0) {
+                bitmap_set1(trunks, trunk);
+            } else {
+                n_errors++;
+            }
+        }
+        if (n_errors) {
+            /* Report the number of bad entries, not the total. */
+            VLOG_ERR("port %s: invalid values for %zu trunk VLANs",
+                     port->name, n_errors);
+        }
+        if (n_errors == n_trunks) {
+            /* Either no trunks were configured at all or none of them was
+             * valid; in both cases the port trunks every VLAN. */
+            if (n_errors) {
+                VLOG_ERR("port %s: no valid trunks, trunking all VLANs",
+                         port->name);
+            }
+            bitmap_set_multiple(trunks, 0, 4096, 1);
+        }
+    } else {
+        if (cfg_has("vlan.%s.trunks", port->name)) {
+            VLOG_ERR("ignoring vlan.%s.trunks in favor of vlan.%s.tag",
+                     port->name, port->name);
+        }
+    }
+    if (trunks == NULL
+        ? port->trunks != NULL
+        : port->trunks == NULL || !bitmap_equal(trunks, port->trunks, 4096)) {
+        bridge_flush(port->bridge);
+    }
+    bitmap_free(port->trunks);
+    port->trunks = trunks;
+
+    svec_destroy(&old_ifaces);
+    svec_destroy(&new_ifaces);
+}
+
+/* Tears down 'port' (a null 'port' is a no-op): withdraws its VLAN compat
+ * entry, destroys every mirror that outputs to it, destroys all of its
+ * interfaces, removes it from its bridge's port array by moving the last
+ * port into its slot, frees its memory, and flushes the bridge. */
+static void
+port_destroy(struct port *port)
+{
+    struct bridge *br;
+    struct port *moved;
+    size_t mirror_idx;
+
+    if (!port) {
+        return;
+    }
+    br = port->bridge;
+
+    proc_net_compat_update_vlan(port->name, NULL, 0);
+
+    for (mirror_idx = 0; mirror_idx < MAX_MIRRORS; mirror_idx++) {
+        struct mirror *mirror = br->mirrors[mirror_idx];
+        if (mirror && mirror->out_port == port) {
+            mirror_destroy(mirror);
+        }
+    }
+
+    /* iface_destroy() shrinks port->n_ifaces, so this terminates. */
+    while (port->n_ifaces) {
+        iface_destroy(port->ifaces[port->n_ifaces - 1]);
+    }
+
+    /* Fill the hole left by 'port' with the bridge's last port. */
+    br->n_ports--;
+    moved = br->ports[br->n_ports];
+    br->ports[port->port_idx] = moved;
+    moved->port_idx = port->port_idx;
+
+    free(port->ifaces);
+    bitmap_free(port->trunks);
+    free(port->name);
+    free(port);
+    bridge_flush(br);
+}
+
+/* Returns the port that owns the interface registered on 'br' under
+ * datapath index 'dp_ifidx', or a null pointer if there is no such
+ * interface. */
+static struct port *
+port_from_dp_ifidx(const struct bridge *br, uint16_t dp_ifidx)
+{
+    struct iface *owner = iface_from_dp_ifidx(br, dp_ifidx);
+
+    if (!owner) {
+        return NULL;
+    }
+    return owner->port;
+}
+
+/* Searches 'br' linearly for a port whose name equals 'name'.  Returns the
+ * first match, or a null pointer if no port has that name. */
+static struct port *
+port_lookup(const struct bridge *br, const char *name)
+{
+    size_t idx = 0;
+
+    while (idx < br->n_ports) {
+        struct port *candidate = br->ports[idx];
+
+        if (strcmp(candidate->name, name) == 0) {
+            return candidate;
+        }
+        idx++;
+    }
+    return NULL;
+}
+
+/* Brings 'port''s bonding state in line with its interface count.
+ *
+ * With two or more interfaces the port is bonded: the hash table is created
+ * on first use (every entry unassigned, zero bytes), a fresh no_ifaces_tag is
+ * drawn, an active interface is chosen, and the bond compat information is
+ * republished.  With fewer than two interfaces any existing hash table is
+ * released and the bond compat entry is withdrawn. */
+static void
+port_update_bonding(struct port *port)
+{
+    if (port->n_ifaces >= 2) {
+        if (port->bond_hash == NULL) {
+            size_t slot;
+
+            port->bond_hash = xcalloc(BOND_MASK + 1, sizeof *port->bond_hash);
+            for (slot = 0; slot <= BOND_MASK; slot++) {
+                struct bond_entry *entry = &port->bond_hash[slot];
+
+                entry->iface_idx = -1;
+                entry->tx_bytes = 0;
+            }
+            port->no_ifaces_tag = tag_create_random();
+            bond_choose_active_iface(port);
+        }
+        port_update_bond_compat(port);
+        return;
+    }
+
+    /* Not a bonded port. */
+    if (port->bond_hash != NULL) {
+        free(port->bond_hash);
+        port->bond_hash = NULL;
+        proc_net_compat_update_bond(port->name, NULL);
+    }
+}
+
+/* Publishes 'port''s bond status through proc_net_compat_update_bond().
+ * Does nothing for ports with fewer than two interfaces.
+ *
+ * A slave is reported as up when it is enabled with no pending delay, or
+ * disabled with a delay still pending.  The bond as a whole is up if any
+ * slave is. */
+static void
+port_update_bond_compat(struct port *port)
+{
+    struct compat_bond bond;
+    size_t idx;
+
+    if (port->n_ifaces < 2) {
+        return;
+    }
+
+    bond.updelay = port->updelay;
+    bond.downdelay = port->downdelay;
+    bond.n_slaves = port->n_ifaces;
+    bond.slaves = xmalloc(port->n_ifaces * sizeof *bond.slaves);
+    bond.up = false;
+
+    for (idx = 0; idx < port->n_ifaces; idx++) {
+        struct compat_bond_slave *slave = &bond.slaves[idx];
+        struct iface *iface = port->ifaces[idx];
+
+        slave->name = iface->name;
+        slave->up = (iface->enabled
+                     ? iface->delay_expires == LLONG_MAX
+                     : iface->delay_expires != LLONG_MAX);
+        if (slave->up) {
+            bond.up = true;
+        }
+        memcpy(slave->mac, iface->mac, ETH_ADDR_LEN);
+    }
+
+    proc_net_compat_update_bond(port->name, &bond);
+    free(bond.slaves);
+}
+
+/* Publishes the VLAN device information for 'port' through
+ * proc_net_compat_update_vlan().
+ *
+ * For a port with a positive VLAN tag, the VLAN device would be named after
+ * a trunk port on the same bridge that carries port->vlan, not after 'port'
+ * itself, so the bridge's ports are searched for such a trunk.  Candidates
+ * must have at least one interface whose first MAC is neither multicast,
+ * reserved, nor zero.  If several qualify, the alphabetically earliest name
+ * wins.  If none qualifies, or the port has no positive tag, a null device
+ * name is passed. */
+static void
+port_update_vlan_compat(struct port *port)
+{
+    struct bridge *br = port->bridge;
+    char *vlandev_name = NULL;
+    size_t idx;
+
+    if (port->vlan > 0) {
+        for (idx = 0; idx < br->n_ports; idx++) {
+            struct port *candidate = br->ports[idx];
+            const uint8_t *ea;
+
+            if (!port_trunks_vlan(candidate, port->vlan)
+                || !candidate->n_ifaces) {
+                continue;
+            }
+            if (vlandev_name && strcmp(candidate->name, vlandev_name) > 0) {
+                /* The current choice already sorts earlier. */
+                continue;
+            }
+
+            ea = candidate->ifaces[0]->mac;
+            if (eth_addr_is_multicast(ea)
+                || eth_addr_is_reserved(ea)
+                || eth_addr_is_zero(ea)) {
+                continue;
+            }
+            vlandev_name = candidate->name;
+        }
+    }
+
+    proc_net_compat_update_vlan(port->name, vlandev_name, port->vlan);
+}
+
+/* Interface functions. */
+
+/* Creates an interface for network device 'name' and appends it to 'port'.
+ *
+ * The interface starts without a datapath index (-1), with a fresh random
+ * tag, and with no pending delay.  It is considered enabled unless the
+ * device's flags can be read and show that the device is not up.  Adding a
+ * second interface marks the bridge as having bonded ports.  Afterward the
+ * port's bonding state is refreshed and the bridge is flushed. */
+static void
+iface_create(struct port *port, const char *name)
+{
+    struct iface *new_iface = xcalloc(1, sizeof *new_iface);
+    enum netdev_flags flags;
+
+    new_iface->port = port;
+    new_iface->port_ifidx = port->n_ifaces;
+    new_iface->name = xstrdup(name);
+    new_iface->dp_ifidx = -1;
+    new_iface->tag = tag_create_random();
+    new_iface->delay_expires = LLONG_MAX;
+
+    netdev_nodev_get_etheraddr(name, new_iface->mac);
+
+    /* Default to enabled; a zero return means 'flags' was filled in. */
+    new_iface->enabled = true;
+    if (netdev_nodev_get_flags(name, &flags) == 0) {
+        new_iface->enabled = (flags & NETDEV_UP) != 0;
+    }
+
+    if (port->allocated_ifaces <= port->n_ifaces) {
+        port->ifaces = x2nrealloc(port->ifaces, &port->allocated_ifaces,
+                                  sizeof *port->ifaces);
+    }
+    port->ifaces[port->n_ifaces] = new_iface;
+    port->n_ifaces++;
+    if (port->n_ifaces > 1) {
+        port->bridge->has_bonded_ports = true;
+    }
+
+    VLOG_DBG("attached network device %s to port %s", new_iface->name, port->name);
+
+    port_update_bonding(port);
+    bridge_flush(port->bridge);
+}
+
+/* Detaches 'iface' from its port and frees it (a null 'iface' is a no-op).
+ *
+ * The interface's datapath index mapping, if any, is cleared, and the port's
+ * last interface is moved into the vacated slot.  If 'iface' was the port's
+ * active interface, flows tagged with the active-interface tag are
+ * revalidated and a new active interface is chosen.  Finally the port's
+ * bonding state is refreshed and the bridge is flushed. */
+static void
+iface_destroy(struct iface *iface)
+{
+    struct port *port;
+    struct bridge *br;
+    struct iface *moved;
+    bool was_active;
+
+    if (!iface) {
+        return;
+    }
+    port = iface->port;
+    br = port->bridge;
+    was_active = port->active_iface == iface->port_ifidx;
+
+    if (iface->dp_ifidx >= 0) {
+        port_array_set(&br->ifaces, iface->dp_ifidx, NULL);
+    }
+
+    /* Fill the hole left by 'iface' with the port's last interface. */
+    port->n_ifaces--;
+    moved = port->ifaces[port->n_ifaces];
+    port->ifaces[iface->port_ifidx] = moved;
+    moved->port_ifidx = iface->port_ifidx;
+
+    free(iface->name);
+    free(iface);
+
+    if (was_active) {
+        ofproto_revalidate(port->bridge->ofproto, port->active_iface_tag);
+        bond_choose_active_iface(port);
+    }
+
+    port_update_bonding(port);
+    bridge_flush(port->bridge);
+}
+
+/* Searches every port of 'br' for an interface whose name equals 'name'.
+ * Returns the first match, or a null pointer if there is none. */
+static struct iface *
+iface_lookup(const struct bridge *br, const char *name)
+{
+    size_t port_idx;
+
+    for (port_idx = 0; port_idx < br->n_ports; port_idx++) {
+        struct port *port = br->ports[port_idx];
+        size_t iface_idx = 0;
+
+        while (iface_idx < port->n_ifaces) {
+            struct iface *candidate = port->ifaces[iface_idx++];
+
+            if (strcmp(candidate->name, name) == 0) {
+                return candidate;
+            }
+        }
+    }
+    return NULL;
+}
+
+/* Returns whatever 'br''s interface array holds for datapath index
+ * 'dp_ifidx'. */
+static struct iface *
+iface_from_dp_ifidx(const struct bridge *br, uint16_t dp_ifidx)
+{
+    struct iface *found = port_array_get(&br->ifaces, dp_ifidx);
+
+    return found;
+}
+
+/* Port mirroring. */
+
+/* Brings the port mirrors of 'br' in line with the "mirror.<bridge>"
+ * configuration subsections: destroys mirrors that are no longer configured,
+ * creates newly configured ones, reconfigures every surviving mirror, and
+ * finally recomputes each port's is_mirror_output_port flag. */
+static void
+mirror_reconfigure(struct bridge *br)
+{
+ struct svec old_mirrors, new_mirrors;
+ size_t i;
+
+ /* Collect old and new mirrors. */
+ svec_init(&old_mirrors);
+ svec_init(&new_mirrors);
+ cfg_get_subsections(&new_mirrors, "mirror.%s", br->name);
+ for (i = 0; i < MAX_MIRRORS; i++) {
+ if (br->mirrors[i]) {
+ svec_add(&old_mirrors, br->mirrors[i]->name);
+ }
+ }
+
+ /* Get rid of deleted mirrors and add new mirrors. */
+ svec_sort(&old_mirrors);
+ assert(svec_is_unique(&old_mirrors));
+ svec_sort(&new_mirrors);
+ assert(svec_is_unique(&new_mirrors));
+ for (i = 0; i < MAX_MIRRORS; i++) {
+ struct mirror *m = br->mirrors[i];
+ if (m && !svec_contains(&new_mirrors, m->name)) {
+ mirror_destroy(m);
+ }
+ }
+ for (i = 0; i < new_mirrors.n; i++) {
+ const char *name = new_mirrors.names[i];
+ if (!svec_contains(&old_mirrors, name)) {
+ mirror_create(br, name);
+ }
+ }
+ svec_destroy(&old_mirrors);
+ svec_destroy(&new_mirrors);
+
+ /* Reconfigure all mirrors.  mirror_reconfigure_one() may destroy the
+ * mirror it is given (which clears its br->mirrors[] slot), so each slot
+ * is re-read from the array on every iteration. */
+ for (i = 0; i < MAX_MIRRORS; i++) {
+ if (br->mirrors[i]) {
+ mirror_reconfigure_one(br->mirrors[i]);
+ }
+ }
+
+ /* Update port reserved status: clear the flag on every port, then set it
+ * again on each port that some mirror still uses as its output port. */
+ for (i = 0; i < br->n_ports; i++) {
+ br->ports[i]->is_mirror_output_port = false;
+ }
+ for (i = 0; i < MAX_MIRRORS; i++) {
+ struct mirror *m = br->mirrors[i];
+ if (m && m->out_port) {
+ m->out_port->is_mirror_output_port = true;
+ }
+ }
+}
+
+/* Creates an empty mirror called 'name' in the first free slot of 'br''s
+ * mirror table and flushes the bridge.  The mirror starts with no selected
+ * ports or VLANs and no output port or VLAN.  If all MAX_MIRRORS slots are
+ * taken, logs a warning and does nothing else. */
+static void
+mirror_create(struct bridge *br, const char *name)
+{
+    struct mirror *mirror;
+    size_t slot;
+
+    /* Find the first unused slot. */
+    slot = 0;
+    while (slot < MAX_MIRRORS && br->mirrors[slot]) {
+        slot++;
+    }
+    if (slot >= MAX_MIRRORS) {
+        VLOG_WARN("bridge %s: maximum of %d port mirrors reached, "
+                  "cannot create %s", br->name, MAX_MIRRORS, name);
+        return;
+    }
+
+    VLOG_INFO("created port mirror %s on bridge %s", name, br->name);
+    bridge_flush(br);
+
+    mirror = xcalloc(1, sizeof *mirror);
+    br->mirrors[slot] = mirror;
+    mirror->bridge = br;
+    mirror->idx = slot;
+    mirror->name = xstrdup(name);
+    svec_init(&mirror->src_ports);
+    svec_init(&mirror->dst_ports);
+    mirror->vlans = NULL;
+    mirror->n_vlans = 0;
+    mirror->out_vlan = -1;
+    mirror->out_port = NULL;
+}
+
+/* Destroys mirror 'm' (a null 'm' is a no-op): clears its bit from the
+ * source and destination mirror masks of every port on its bridge, releases
+ * its port lists and VLAN array, vacates its slot in the bridge's mirror
+ * table, frees it, and flushes the bridge. */
+static void
+mirror_destroy(struct mirror *m)
+{
+    struct bridge *br;
+    size_t port_idx;
+
+    if (!m) {
+        return;
+    }
+    br = m->bridge;
+
+    for (port_idx = 0; port_idx < br->n_ports; port_idx++) {
+        struct port *port = br->ports[port_idx];
+
+        port->src_mirrors &= ~(MIRROR_MASK_C(1) << m->idx);
+        port->dst_mirrors &= ~(MIRROR_MASK_C(1) << m->idx);
+    }
+
+    svec_destroy(&m->src_ports);
+    svec_destroy(&m->dst_ports);
+    free(m->vlans);
+
+    br->mirrors[m->idx] = NULL;
+    free(m);
+
+    bridge_flush(br);
+}
+
+/* Sorts 'ports' and drops duplicates, then removes every name that does not
+ * refer to an existing port on 'm''s bridge, logging a warning for each name
+ * removed. */
+static void
+prune_ports(struct mirror *m, struct svec *ports)
+{
+    struct svec kept;
+    size_t idx;
+
+    svec_sort_unique(ports);
+
+    svec_init(&kept);
+    for (idx = 0; idx < ports->n; idx++) {
+        const char *port_name = ports->names[idx];
+
+        if (!port_lookup(m->bridge, port_name)) {
+            VLOG_WARN("mirror.%s.%s: cannot match on nonexistent port %s",
+                      m->bridge->name, m->name, port_name);
+            continue;
+        }
+        svec_add(&kept, port_name);
+    }
+
+    /* Hand the surviving names to the caller and release the old list. */
+    svec_swap(ports, &kept);
+    svec_destroy(&kept);
+}
+
+/* Converts the VLAN strings in 'vlan_strings' into integers.
+ *
+ * 'vlan_strings' is sorted and de-duplicated in place.  A newly allocated
+ * array with room for one int per remaining string is stored in '*vlans';
+ * the caller owns it and must free() it.  Strings that are not decimal
+ * integers in the range 0...4095 are skipped with a warning.
+ *
+ * Returns the number of valid VLANs stored at the start of '*vlans'. */
+static size_t
+prune_vlans(struct mirror *m, struct svec *vlan_strings, int **vlans)
+{
+    size_t n_vlans, i;
+
+    /* This isn't perfect: it won't combine "0" and "00", and the textual sort
+     * order won't give us numeric sort order.  But that's good enough for what
+     * we need right now. */
+    svec_sort_unique(vlan_strings);
+
+    /* Size each element as an int ('**vlans'), not as a pointer to int. */
+    *vlans = xmalloc(sizeof **vlans * vlan_strings->n);
+    n_vlans = 0;
+    for (i = 0; i < vlan_strings->n; i++) {
+        const char *name = vlan_strings->names[i];
+        int vlan;
+        if (!str_to_int(name, 10, &vlan) || vlan < 0 || vlan > 4095) {
+            VLOG_WARN("mirror.%s.%s.select.vlan: ignoring invalid VLAN %s",
+                      m->bridge->name, m->name, name);
+        } else {
+            (*vlans)[n_vlans++] = vlan;
+        }
+    }
+    return n_vlans;
+}
+
+/* Returns true if 'vlan' is one of the VLANs selected by mirror 'm', false
+ * otherwise. */
+static bool
+vlan_is_mirrored(const struct mirror *m, int vlan)
+{
+    size_t idx = 0;
+
+    while (idx < m->n_vlans) {
+        if (vlan == m->vlans[idx]) {
+            return true;
+        }
+        idx++;
+    }
+    return false;
+}
+
+/* Returns true if port_trunks_vlan() reports that 'p' trunks at least one of
+ * the VLANs selected by mirror 'm', false otherwise. */
+static bool
+port_trunks_any_mirrored_vlan(const struct mirror *m, const struct port *p)
+{
+    size_t idx = 0;
+
+    while (idx < m->n_vlans) {
+        if (port_trunks_vlan(p, m->vlans[idx])) {
+            return true;
+        }
+        idx++;
+    }
+    return false;
+}
+
+/* Re-reads the configuration of mirror 'm' and applies it.
+ *
+ * Exactly one of "output.port" and "output.vlan" is required.  If neither is
+ * given, or the named output port does not exist, 'm' is DESTROYED and must
+ * not be used by the caller afterward.
+ *
+ * Otherwise the selected source ports, destination ports, and VLANs are
+ * re-read, the bridge is flushed if anything changed, and every port's
+ * src_mirrors/dst_mirrors bit for this mirror is recomputed.  A mirror with
+ * no selection criteria at all mirrors every port. */
+static void
+mirror_reconfigure_one(struct mirror *m)
+{
+    char *pfx = xasprintf("mirror.%s.%s", m->bridge->name, m->name);
+    struct svec src_ports, dst_ports, ports;
+    struct svec vlan_strings;
+    mirror_mask_t mirror_bit;
+    const char *out_port_name;
+    struct port *out_port;
+    int out_vlan;
+    size_t n_vlans;
+    int *vlans;
+    size_t i;
+    bool mirror_all_ports;
+
+    /* Get output port. */
+    out_port_name = cfg_get_key(0, "mirror.%s.%s.output.port",
+                                m->bridge->name, m->name);
+    if (out_port_name) {
+        out_port = port_lookup(m->bridge, out_port_name);
+        if (!out_port) {
+            VLOG_ERR("%s.output.port: bridge %s does not have a port "
+                     "named %s", pfx, m->bridge->name, out_port_name);
+            mirror_destroy(m);
+            free(pfx);
+            return;
+        }
+        out_vlan = -1;
+
+        if (cfg_has("%s.output.vlan", pfx)) {
+            VLOG_ERR("%s.output.port and %s.output.vlan both specified; "
+                     "ignoring %s.output.vlan", pfx, pfx, pfx);
+        }
+    } else if (cfg_has("%s.output.vlan", pfx)) {
+        out_port = NULL;
+        out_vlan = cfg_get_vlan(0, "%s.output.vlan", pfx);
+    } else {
+        VLOG_ERR("%s: neither %s.output.port nor %s.output.vlan specified, "
+                 "but exactly one is required; disabling port mirror %s",
+                 pfx, pfx, pfx, pfx);
+        mirror_destroy(m);
+        free(pfx);
+        return;
+    }
+
+    /* Get all the ports, and drop duplicates and ports that don't exist.
+     * "select.port" entries count as both source and destination. */
+    svec_init(&src_ports);
+    svec_init(&dst_ports);
+    svec_init(&ports);
+    cfg_get_all_keys(&src_ports, "%s.select.src-port", pfx);
+    cfg_get_all_keys(&dst_ports, "%s.select.dst-port", pfx);
+    cfg_get_all_keys(&ports, "%s.select.port", pfx);
+    svec_append(&src_ports, &ports);
+    svec_append(&dst_ports, &ports);
+    svec_destroy(&ports);
+    prune_ports(m, &src_ports);
+    prune_ports(m, &dst_ports);
+
+    /* Get all the vlans, and drop duplicate and invalid vlans. */
+    svec_init(&vlan_strings);
+    cfg_get_all_keys(&vlan_strings, "%s.select.vlan", pfx);
+    n_vlans = prune_vlans(m, &vlan_strings, &vlans);
+    svec_destroy(&vlan_strings);
+
+    /* Update mirror data.  The memcmp() is skipped when there are no VLANs
+     * to compare, because m->vlans may then be a null pointer, which must
+     * not be passed to memcmp() even with a zero length. */
+    if (!svec_equal(&m->src_ports, &src_ports)
+        || !svec_equal(&m->dst_ports, &dst_ports)
+        || m->n_vlans != n_vlans
+        || (n_vlans && memcmp(m->vlans, vlans, sizeof *vlans * n_vlans))
+        || m->out_port != out_port
+        || m->out_vlan != out_vlan) {
+        bridge_flush(m->bridge);
+    }
+    svec_swap(&m->src_ports, &src_ports);
+    svec_swap(&m->dst_ports, &dst_ports);
+    free(m->vlans);
+    m->vlans = vlans;
+    m->n_vlans = n_vlans;
+    m->out_port = out_port;
+    m->out_vlan = out_vlan;
+
+    /* If no selection criteria have been given, mirror for all ports. */
+    mirror_all_ports = (!m->src_ports.n) && (!m->dst_ports.n) && (!m->n_vlans);
+
+    /* Update ports. */
+    mirror_bit = MIRROR_MASK_C(1) << m->idx;
+    for (i = 0; i < m->bridge->n_ports; i++) {
+        struct port *port = m->bridge->ports[i];
+
+        /* A port without a VLAN tag has port->vlan == -1 and is a trunk
+         * port, so it is selected if it trunks any mirrored VLAN.  A tagged
+         * port (including tag 0) is selected if its own VLAN is mirrored. */
+        if (mirror_all_ports
+            || svec_contains(&m->src_ports, port->name)
+            || (m->n_vlans
+                && (port->vlan < 0
+                    ? port_trunks_any_mirrored_vlan(m, port)
+                    : vlan_is_mirrored(m, port->vlan)))) {
+            port->src_mirrors |= mirror_bit;
+        } else {
+            port->src_mirrors &= ~mirror_bit;
+        }
+
+        if (mirror_all_ports || svec_contains(&m->dst_ports, port->name)) {
+            port->dst_mirrors |= mirror_bit;
+        } else {
+            port->dst_mirrors &= ~mirror_bit;
+        }
+    }
+
+    /* Clean up.  After the swaps above, 'src_ports' and 'dst_ports' hold the
+     * mirror's previous lists. */
+    svec_destroy(&src_ports);
+    svec_destroy(&dst_ports);
+    free(pfx);
+}
+
+/* Spanning tree protocol. */
+
+static void brstp_update_port_state(struct port *);
+
+/* Callback handed to stp_create(): transmits BPDU 'pkt' on datapath port
+ * 'port_no' of the bridge passed as 'br_'.
+ *
+ * The packet's Ethernet source address is overwritten with the MAC of the
+ * interface at 'port_no'.  Nothing is sent (and a rate-limited warning is
+ * logged) if there is no such interface or its MAC is all-zeros.  'pkt' is
+ * always deleted before returning. */
+static void
+brstp_send_bpdu(struct ofpbuf *pkt, int port_no, void *br_)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+    struct bridge *br = br_;
+    struct iface *iface = iface_from_dp_ifidx(br, port_no);
+
+    if (iface && !eth_addr_is_zero(iface->mac)) {
+        struct eth_header *eth = pkt->l2;
+        union ofp_action action;
+        flow_t flow;
+
+        memcpy(eth->eth_src, iface->mac, ETH_ADDR_LEN);
+
+        /* Build a single "output to port_no" action. */
+        memset(&action, 0, sizeof action);
+        action.type = htons(OFPAT_OUTPUT);
+        action.output.len = htons(sizeof action);
+        action.output.port = htons(port_no);
+
+        flow_extract(pkt, ODPP_NONE, &flow);
+        ofproto_send_packet(br->ofproto, &flow, &action, 1, pkt);
+    } else if (!iface) {
+        VLOG_WARN_RL(&rl, "%s: cannot send BPDU on unknown port %d",
+                     br->name, port_no);
+    } else {
+        VLOG_WARN_RL(&rl, "%s: cannot send BPDU on port %d with unknown MAC",
+                     br->name, port_no);
+    }
+
+    ofpbuf_delete(pkt);
+}
+
+/* Applies the "stp.<bridge>.*" configuration to 'br'.
+ *
+ * If STP is disabled in the configuration, any existing STP instance is
+ * destroyed.  Otherwise an STP instance is created (or its bridge ID is
+ * updated), every port's STP enable state, path cost, and priority are set
+ * from the configuration, and the STP timers are adjusted.  In both cases
+ * each port's cached STP state is refreshed at the end. */
+static void
+brstp_reconfigure(struct bridge *br)
+{
+ size_t i;
+
+ if (!cfg_get_bool(0, "stp.%s.enabled", br->name)) {
+ if (br->stp) {
+ stp_destroy(br->stp);
+ br->stp = NULL;
+
+ bridge_flush(br);
+ }
+ } else {
+ uint64_t bridge_address, bridge_id;
+ int bridge_priority;
+
+ /* Pick the bridge address: the configured one if nonzero, else the one
+ * already in use by the running STP instance (the low 48 bits of its
+ * bridge ID), else a random Ethernet address. */
+ bridge_address = cfg_get_mac(0, "stp.%s.address", br->name);
+ if (!bridge_address) {
+ if (br->stp) {
+ bridge_address = (stp_get_bridge_id(br->stp)
+ & ((UINT64_C(1) << 48) - 1));
+ } else {
+ uint8_t mac[ETH_ADDR_LEN];
+ eth_addr_random(mac);
+ bridge_address = eth_addr_to_uint64(mac);
+ }
+ }
+
+ if (cfg_is_valid(CFG_INT | CFG_REQUIRED, "stp.%s.priority",
+ br->name)) {
+ bridge_priority = cfg_get_int(0, "stp.%s.priority", br->name);
+ } else {
+ bridge_priority = STP_DEFAULT_BRIDGE_PRIORITY;
+ }
+
+ /* The bridge ID is the priority in the bits above the 48-bit address. */
+ bridge_id = bridge_address | ((uint64_t) bridge_priority << 48);
+ if (!br->stp) {
+ br->stp = stp_create(br->name, bridge_id, brstp_send_bpdu, br);
+ br->stp_last_tick = time_msec();
+ bridge_flush(br);
+ } else {
+ if (bridge_id != stp_get_bridge_id(br->stp)) {
+ stp_set_bridge_id(br->stp, bridge_id);
+ bridge_flush(br);
+ }
+ }
+
+ for (i = 0; i < br->n_ports; i++) {
+ struct port *p = br->ports[i];
+ int dp_ifidx;
+ struct stp_port *sp;
+ int path_cost, priority;
+ bool enable;
+
+ /* Only the port's first interface takes part in STP, and only if its
+ * datapath index fits in the STP port table. */
+ if (!p->n_ifaces) {
+ continue;
+ }
+ dp_ifidx = p->ifaces[0]->dp_ifidx;
+ if (dp_ifidx < 0 || dp_ifidx >= STP_MAX_PORTS) {
+ continue;
+ }
+
+ /* STP is enabled on a port unless the configuration has a valid
+ * boolean that says otherwise, and never on a mirror output port. */
+ sp = stp_get_port(br->stp, dp_ifidx);
+ enable = (!cfg_is_valid(CFG_BOOL | CFG_REQUIRED,
+ "stp.%s.port.%s.enabled",
+ br->name, p->name)
+ || cfg_get_bool(0, "stp.%s.port.%s.enabled",
+ br->name, p->name));
+ if (p->is_mirror_output_port) {
+ enable = false;
+ }
+ if (enable != (stp_port_get_state(sp) != STP_DISABLED)) {
+ bridge_flush(br); /* Might not be necessary. */
+ if (enable) {
+ stp_port_enable(sp);
+ } else {
+ stp_port_disable(sp);
+ }
+ }
+
+ /* A path cost of 0 (the value obtained when none is usable) falls back
+ * to 19. */
+ path_cost = cfg_get_int(0, "stp.%s.port.%s.path-cost",
+ br->name, p->name);
+ stp_port_set_path_cost(sp, path_cost ? path_cost : 19 /* XXX */);
+
+ priority = (cfg_is_valid(CFG_INT | CFG_REQUIRED,
+ "stp.%s.port.%s.priority",
+ br->name, p->name)
+ ? cfg_get_int(0, "stp.%s.port.%s.priority",
+ br->name, p->name)
+ : STP_DEFAULT_PORT_PRIORITY);
+ stp_port_set_priority(sp, priority);
+ }
+
+ brstp_adjust_timers(br);
+ }
+ /* Whether or not STP is enabled, resynchronize every port's cached state. */
+ for (i = 0; i < br->n_ports; i++) {
+ brstp_update_port_state(br->ports[i]);
+ }
+}
+
+/* Recomputes the STP state of port 'p' and, if it changed, records it.
+ *
+ * The state is STP_DISABLED unless the bridge runs STP and the port's first
+ * interface has a datapath index within the STP port table, in which case it
+ * is whatever STP reports for that index.
+ *
+ * On a change, the bridge is flushed if the port was previously disabled;
+ * otherwise only flows carrying the port's old STP tag are revalidated.  The
+ * port then receives a fresh random tag (or 0 if it is now disabled). */
+static void
+brstp_update_port_state(struct port *p)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
+    struct bridge *br = p->bridge;
+    enum stp_state new_state = STP_DISABLED;
+
+    /* Figure out new state. */
+    if (br->stp && p->n_ifaces > 0) {
+        int dp_ifidx = p->ifaces[0]->dp_ifidx;
+
+        if (dp_ifidx >= 0 && dp_ifidx < STP_MAX_PORTS) {
+            new_state = stp_port_get_state(stp_get_port(br->stp, dp_ifidx));
+        }
+    }
+
+    if (new_state == p->stp_state) {
+        return;
+    }
+
+    /* Update state. */
+    VLOG_INFO_RL(&rl, "port %s: STP state changed from %s to %s",
+                 p->name, stp_state_name(p->stp_state),
+                 stp_state_name(new_state));
+    if (p->stp_state != STP_DISABLED) {
+        ofproto_revalidate(p->bridge->ofproto, p->stp_state_tag);
+    } else {
+        bridge_flush(br);
+    }
+    p->stp_state = new_state;
+    if (p->stp_state == STP_DISABLED) {
+        p->stp_state_tag = 0;
+    } else {
+        p->stp_state_tag = tag_create_random();
+    }
+}
+
+/* Sets the hello time, max age, and forward delay of 'br''s STP instance
+ * from the "stp.<bridge>.*" configuration keys.  A value of 0 for any of
+ * them is replaced by 2000, 20000, and 15000, respectively. */
+static void
+brstp_adjust_timers(struct bridge *br)
+{
+    int hello_time = cfg_get_int(0, "stp.%s.hello-time", br->name);
+    int max_age = cfg_get_int(0, "stp.%s.max-age", br->name);
+    int forward_delay = cfg_get_int(0, "stp.%s.forward-delay", br->name);
+
+    if (!hello_time) {
+        hello_time = 2000;
+    }
+    if (!max_age) {
+        max_age = 20000;
+    }
+    if (!forward_delay) {
+        forward_delay = 15000;
+    }
+
+    stp_set_hello_time(br->stp, hello_time);
+    stp_set_max_age(br->stp, max_age);
+    stp_set_forward_delay(br->stp, forward_delay);
+}
+
+/* Performs periodic STP work for 'br'; does nothing if STP is not running.
+ *
+ * Feeds the time elapsed since the previous tick (capped at INT_MAX) to the
+ * STP instance, then refreshes the cached state of every port that STP
+ * reports as changed and that maps to a known port on the bridge. */
+static void
+brstp_run(struct bridge *br)
+{
+    long long int now, elapsed;
+    struct stp_port *sp;
+
+    if (!br->stp) {
+        return;
+    }
+
+    now = time_msec();
+    elapsed = now - br->stp_last_tick;
+    if (elapsed > 0) {
+        stp_tick(br->stp, MIN(INT_MAX, elapsed));
+        br->stp_last_tick = now;
+    }
+
+    while (stp_get_changed_port(br->stp, &sp)) {
+        struct port *changed = port_from_dp_ifidx(br, stp_port_no(sp));
+
+        if (changed != NULL) {
+            brstp_update_port_state(changed);
+        }
+    }
+}
+
+/* If 'br' runs STP, registers a poll timer with argument 1000 so that the
+ * poll loop wakes up for STP processing. */
+static void
+brstp_wait(struct bridge *br)
+{
+    if (!br->stp) {
+        return;
+    }
+    poll_timer_wait(1000);
+}
diff --git a/vswitchd/bridge.h b/vswitchd/bridge.h
new file mode 100644
index 00000000..b9435370
--- /dev/null
+++ b/vswitchd/bridge.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2008, 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#ifndef VSWITCHD_BRIDGE_H
+#define VSWITCHD_BRIDGE_H 1
+
+/* Public interface of the bridge module. */
+
+#include <stdbool.h>   /* bool, used by bridge_exists(). */
+#include <stddef.h>
+#include <stdint.h>    /* uint64_t, used by bridge_get_datapathid(). */
+#include "list.h"
+
+struct svec;
+
+void bridge_init(void);
+void bridge_reconfigure(void);
+int bridge_run(void);
+void bridge_wait(void);
+bool bridge_exists(const char *);
+uint64_t bridge_get_datapathid(const char *name);
+void bridge_get_ifaces(struct svec *svec);
+
+#endif /* bridge.h */
diff --git a/vswitchd/mgmt.c b/vswitchd/mgmt.c
new file mode 100644
index 00000000..f5dcd184
--- /dev/null
+++ b/vswitchd/mgmt.c
@@ -0,0 +1,679 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <config.h>
+
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+
+#include "bridge.h"
+#include "cfg.h"
+#include "coverage.h"
+#include "list.h"
+#include "mgmt.h"
+#include "openflow/nicira-ext.h"
+#include "openflow/openflow.h"
+#include "openflow/openflow-mgmt.h"
+#include "ofpbuf.h"
+#include "ovs-vswitchd.h"
+#include "packets.h"
+#include "rconn.h"
+#include "svec.h"
+#include "vconn.h"
+#include "vconn-ssl.h"
+#include "xtoxll.h"
+
+#define THIS_MODULE VLM_mgmt
+#include "vlog.h"
+
+/* Defaults used by mgmt_reconfigure() when "mgmt.max-backoff" is below 1 or
+ * "mgmt.inactivity-probe" is below 5.  NOTE(review): units are presumably
+ * seconds; confirm against rconn_create(). */
+#define MAX_BACKOFF_DEFAULT 15
+#define INACTIVITY_PROBE_DEFAULT 15
+
+/* Contents of the "mgmt" configuration section as last applied. */
+static struct svec mgmt_cfg;
+/* Configuration cookie last seen by mgmt_reconfigure(). */
+static uint8_t cfg_cookie[CFG_COOKIE_LEN];
+/* Connection to the management controller, or null when not configured. */
+static struct rconn *mgmt_rconn;
+/* Rate limit shared by this file's rate-limited log messages. */
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60);
+/* Capability strings sent in capability replies. */
+static struct svec capabilities;
+/* Management ID: taken from "mgmt.id" or, failing that, a fallback value. */
+uint64_t mgmt_id;
+
+
+#define TXQ_LIMIT 128 /* Max number of packets to queue for tx. */
+struct rconn_packet_counter *txqlen; /* # pkts queued for tx on mgmt_rconn. */
+
+static uint64_t pick_fallback_mgmt_id(void);
+static void send_config_update(uint32_t xid, bool use_xid);
+static void send_resources_update(uint32_t xid, bool use_xid);
+
+/* One-time initialization of the management module: creates the transmit
+ * queue counter, initializes the saved configuration and the capability
+ * list (seeded with a single capability string), and sets 'mgmt_id' from the
+ * "mgmt.id" configuration key, falling back to pick_fallback_mgmt_id() when
+ * that yields zero. */
+void
+mgmt_init(void)
+{
+    uint64_t configured_id;
+
+    txqlen = rconn_packet_counter_create();
+
+    svec_init(&mgmt_cfg);
+    svec_init(&capabilities);
+    svec_add_nocopy(&capabilities,
+                    xasprintf("com.nicira.mgmt.manager=true\n"));
+
+    configured_id = cfg_get_dpid(0, "mgmt.id");
+    if (configured_id) {
+        mgmt_id = configured_id;
+    } else {
+        /* Nothing usable was configured, so fall back. */
+        mgmt_id = pick_fallback_mgmt_id();
+    }
+}
+
+#ifdef HAVE_OPENSSL
+/* Compares the configured string for 'key' against the cached copy in
+ * '*valuep'.
+ *
+ * If 'key' has a value and it differs from '*valuep' (or '*valuep' is null),
+ * frees the old copy, stores a newly allocated copy of the new value in
+ * '*valuep', and returns true.  Otherwise leaves '*valuep' alone and returns
+ * false. */
+static bool
+config_string_change(const char *key, char **valuep)
+{
+    const char *configured = cfg_get_string(0, "%s", key);
+
+    if (!configured) {
+        return false;
+    }
+    if (*valuep && strcmp(configured, *valuep) == 0) {
+        /* Unchanged. */
+        return false;
+    }
+
+    free(*valuep);
+    *valuep = xstrdup(configured);
+    return true;
+}
+
+/* Pushes the SSL private key, certificate, and CA certificate file names
+ * from the "ssl.*" configuration keys to the SSL vconn layer.  Each setting
+ * is pushed only when its configured value has changed since the previous
+ * call; the last values are remembered in function-local statics.
+ *
+ * XXX SSL should be configurable separate from the bridges.
+ * XXX should be possible to de-configure SSL. */
+static void
+mgmt_configure_ssl(void)
+{
+    static char *saved_private_key;
+    static char *saved_certificate;
+    static char *saved_ca_cert;
+    bool changed;
+
+    changed = config_string_change("ssl.private-key", &saved_private_key);
+    if (changed) {
+        vconn_ssl_set_private_key_file(saved_private_key);
+    }
+
+    changed = config_string_change("ssl.certificate", &saved_certificate);
+    if (changed) {
+        vconn_ssl_set_certificate_file(saved_certificate);
+    }
+
+    changed = config_string_change("ssl.ca-cert", &saved_ca_cert);
+    if (changed) {
+        vconn_ssl_set_ca_cert_file(saved_ca_cert,
+                                   cfg_get_bool(0, "ssl.bootstrap-ca-cert"));
+    }
+}
+#endif
+
+/* Applies the "mgmt" configuration section.
+ *
+ * Without a "mgmt" section, any existing management connection is torn down.
+ * With an unchanged section, the controller is merely notified if the
+ * configuration cookie changed.  With a changed section, the connection
+ * parameters are re-read and a new connection to the configured controller
+ * is started, replacing any existing one. */
+void
+mgmt_reconfigure(void)
+{
+ struct svec new_cfg;
+ uint8_t new_cookie[CFG_COOKIE_LEN];
+ bool cfg_updated = false;
+ const char *controller_name;
+ int max_backoff;
+ int inactivity_probe;
+ int retval;
+
+ /* NOTE(review): 'mgmt_cfg' is not cleared on this path, so if an identical
+ * "mgmt" section is added back later, the svec_equal() check below appears
+ * to return early without reconnecting -- confirm. */
+ if (!cfg_has_section("mgmt")) {
+ if (mgmt_rconn) {
+ rconn_destroy(mgmt_rconn);
+ mgmt_rconn = NULL;
+ }
+ return;
+ }
+
+ /* If this is an established connection, send a resources update. */
+ /* xxx This is wasteful if there were no resource changes!!! */
+ if (mgmt_rconn) {
+ send_resources_update(0, false);
+ }
+
+ /* Remember whether the configuration as a whole changed. */
+ cfg_get_cookie(new_cookie);
+ if (memcmp(cfg_cookie, new_cookie, sizeof(cfg_cookie))) {
+ memcpy(cfg_cookie, new_cookie, sizeof(cfg_cookie));
+ cfg_updated = true;
+ }
+
+ svec_init(&new_cfg);
+ cfg_get_section(&new_cfg, "mgmt");
+ if (svec_equal(&mgmt_cfg, &new_cfg)) {
+ /* Reconnecting to the controller causes the config file to be
+ * resent automatically. If we're not reconnecting and the
+ * config file has changed, we need to notify the controller of
+ * changes. */
+ if (cfg_updated && mgmt_rconn) {
+ send_config_update(0, false);
+ }
+ svec_destroy(&new_cfg);
+ return;
+ }
+
+ controller_name = cfg_get_string(0, "mgmt.controller");
+ if (!controller_name) {
+ VLOG_ERR("no controller specified for managment");
+ svec_destroy(&new_cfg);
+ return;
+ }
+
+ /* Values below 1 select the default; values above 3600 are capped. */
+ max_backoff = cfg_get_int(0, "mgmt.max-backoff");
+ if (max_backoff < 1) {
+ max_backoff = MAX_BACKOFF_DEFAULT;
+ } else if (max_backoff > 3600) {
+ max_backoff = 3600;
+ }
+
+ /* Values below 5 select the default. */
+ inactivity_probe = cfg_get_int(0, "mgmt.inactivity-probe");
+ if (inactivity_probe < 5) {
+ inactivity_probe = INACTIVITY_PROBE_DEFAULT;
+ }
+
+ /* xxx If this changes, we need to restart bridges to use new id,
+ * xxx but they need the id before the connect to controller, but we
+ * xxx need their dpids. */
+ /* Check if a different mgmt id has been assigned. */
+ if (cfg_has("mgmt.id")) {
+ uint64_t cfg_mgmt_id = cfg_get_dpid(0, "mgmt.id");
+ if (cfg_mgmt_id != mgmt_id) {
+ mgmt_id = cfg_mgmt_id;
+ }
+ }
+
+ /* Save the new section as the applied one and release the old copy. */
+ svec_swap(&new_cfg, &mgmt_cfg);
+ svec_destroy(&new_cfg);
+
+#ifdef HAVE_OPENSSL
+ /* Configure SSL. */
+ mgmt_configure_ssl();
+#endif
+
+ /* Replace any existing connection with a new one to the controller. */
+ if (mgmt_rconn) {
+ rconn_destroy(mgmt_rconn);
+ mgmt_rconn = NULL;
+ }
+ mgmt_rconn = rconn_create(inactivity_probe, max_backoff);
+ retval = rconn_connect(mgmt_rconn, controller_name);
+ if (retval == EAFNOSUPPORT) {
+ VLOG_ERR("no support for %s vconn", controller_name);
+ }
+}
+
+/* Finalizes the OpenFlow length field of 'buffer' and queues it on the
+ * management connection, subject to the TXQ_LIMIT transmit queue limit.
+ *
+ * Returns 0 on success.  Returns EINVAL, without touching 'buffer', if there
+ * is no management connection; otherwise returns the error reported by the
+ * send, which is also logged with rate limiting. */
+static int
+send_openflow_buffer(struct ofpbuf *buffer)
+{
+    int error;
+
+    if (mgmt_rconn == NULL) {
+        VLOG_ERR("attempt to send openflow packet with no rconn\n");
+        return EINVAL;
+    }
+
+    update_openflow_length(buffer);
+    error = rconn_send_with_limit(mgmt_rconn, buffer, txqlen, TXQ_LIMIT);
+    if (error != 0) {
+        VLOG_WARN_RL(&rl, "send to %s failed: %s",
+                     rconn_get_name(mgmt_rconn), strerror(error));
+    }
+    return error;
+}
+
+/* Sends an OFPT_FEATURES_REPLY with transaction ID 'xid' on the management
+ * connection.  Every feature field set here (datapath ID, table and buffer
+ * counts, capabilities, actions) is zero. */
+static void
+send_features_reply(uint32_t xid)
+{
+    struct ofp_switch_features *reply;
+    struct ofpbuf *buffer;
+
+    reply = make_openflow_xid(sizeof *reply, OFPT_FEATURES_REPLY, xid,
+                              &buffer);
+    reply->datapath_id = 0;
+    reply->n_tables = 0;
+    reply->n_buffers = 0;
+    reply->capabilities = 0;
+    reply->actions = 0;
+
+    send_openflow_buffer(buffer);
+}
+
+/* Allocates an OFPT_VENDOR message of 'ofmp_len' bytes with transaction ID
+ * 'xid', fills in the Nicira vendor ID, the NXT_MGMT subtype, and the
+ * management message 'type', and stores the new buffer in '*bufferp'.
+ *
+ * Returns a pointer to the start of the message inside the buffer. */
+static void *
+make_ofmp_xid(size_t ofmp_len, uint16_t type, uint32_t xid,
+              struct ofpbuf **bufferp)
+{
+    struct ofmp_header *msg = make_openflow_xid(ofmp_len, OFPT_VENDOR, xid,
+                                                bufferp);
+
+    msg->type = htons(type);
+    msg->header.subtype = htonl(NXT_MGMT);
+    msg->header.vendor = htonl(NX_VENDOR_ID);
+    return msg;
+}
+
+/* Same as make_ofmp_xid(), except that the message is built with
+ * make_openflow() and so the caller does not supply a transaction id. */
+static void *
+make_ofmp(size_t ofmp_len, uint16_t type, struct ofpbuf **bufferp)
+{
+    struct ofmp_header *ofmph = make_openflow(ofmp_len, OFPT_VENDOR, bufferp);
+
+    ofmph->type = htons(type);
+    ofmph->header.subtype = htonl(NXT_MGMT);
+    ofmph->header.vendor = htonl(NX_VENDOR_ID);
+    return ofmph;
+}
+
+/* Replies to a capability request with transaction id 'xid'.  The fixed part
+ * of the reply carries the format and this switch's management id; every
+ * string in 'capabilities' is then appended to the message back to back,
+ * without its terminating null byte. */
+static void
+send_capability_reply(uint32_t xid)
+{
+    struct ofmp_capability_reply *reply;
+    struct ofpbuf *msg;
+    int idx;
+
+    reply = make_ofmp_xid(sizeof *reply, OFMPT_CAPABILITY_REPLY, xid, &msg);
+    reply->mgmt_id = htonll(mgmt_id);
+    reply->format = htonl(OFMPCOF_SIMPLE);
+
+    idx = 0;
+    while (idx < capabilities.n) {
+        const char *name = capabilities.names[idx];
+
+        ofpbuf_put(msg, name, strlen(name));
+        idx++;
+    }
+    send_openflow_buffer(msg);
+}
+
+/* Sends an OFMPT_RESOURCES_UPDATE message to the controller.
+ *
+ * If 'use_xid' is true the message is built with transaction id 'xid' (for a
+ * reply to a request); otherwise make_ofmp() is used and 'xid' is ignored.
+ *
+ * The body holds one OFMPTSR_DP TLV (datapath id plus name) for each
+ * "bridge" subsection of the configuration whose datapath id is nonzero,
+ * followed by an OFMPTSR_END marker TLV. */
+static void
+send_resources_update(uint32_t xid, bool use_xid)
+{
+    struct ofpbuf *buffer;
+    struct ofmp_resources_update *ofmpru;
+    struct ofmp_tlv *tlv;
+    struct svec br_list;
+    size_t i;
+
+    if (use_xid) {
+        ofmpru = make_ofmp_xid(sizeof *ofmpru, OFMPT_RESOURCES_UPDATE,
+                               xid, &buffer);
+    } else {
+        ofmpru = make_ofmp(sizeof *ofmpru, OFMPT_RESOURCES_UPDATE, &buffer);
+    }
+
+    svec_init(&br_list);
+    cfg_get_subsections(&br_list, "bridge");
+    for (i = 0; i < br_list.n; i++) {
+        const char *br_name = br_list.names[i];
+        struct ofmptsr_dp *dp_tlv;
+        size_t name_len;
+        uint64_t dp_id = bridge_get_datapathid(br_name);
+
+        if (!dp_id) {
+            VLOG_WARN_RL(&rl, "bridge %s doesn't seem to exist", br_name);
+            continue;
+        }
+        dp_tlv = ofpbuf_put_zeros(buffer, sizeof(*dp_tlv));
+        dp_tlv->type = htons(OFMPTSR_DP);
+        dp_tlv->len = htons(sizeof(*dp_tlv));
+
+        dp_tlv->dp_id = htonll(dp_id);
+
+        /* Never write past the end of the TLV's name field.  The TLV was
+         * zero-filled above, so a truncated name is still null-terminated.
+         * NOTE(review): assumes 'name' is a fixed-size array member. */
+        name_len = strlen(br_name);
+        if (name_len >= sizeof dp_tlv->name) {
+            name_len = sizeof dp_tlv->name - 1;
+        }
+        memcpy(dp_tlv->name, br_name, name_len);
+    }
+    /* The bridge names have been copied into the message; release the list
+     * so that it is not leaked on every update. */
+    svec_destroy(&br_list);
+
+    /* Put end marker. */
+    tlv = ofpbuf_put_zeros(buffer, sizeof(*tlv));
+    tlv->type = htons(OFMPTSR_END);
+    tlv->len = htons(sizeof(*tlv));
+    send_openflow_buffer(buffer);
+}
+
+/* Sends an OFMPT_CONFIG_UPDATE message holding the current configuration
+ * cookie, followed by whatever cfg_buf_put() appends to the buffer.  With
+ * 'use_xid' true the message carries transaction id 'xid'; otherwise 'xid'
+ * is not used. */
+static void
+send_config_update(uint32_t xid, bool use_xid)
+{
+    struct ofmp_config_update *update;
+    struct ofpbuf *msg;
+
+    update = (use_xid
+              ? make_ofmp_xid(sizeof *update, OFMPT_CONFIG_UPDATE, xid, &msg)
+              : make_ofmp(sizeof *update, OFMPT_CONFIG_UPDATE, &msg));
+
+    /* Fill in the fixed header before anything is appended to 'msg'. */
+    update->format = htonl(OFMPCOF_SIMPLE);
+    memcpy(update->cookie, cfg_cookie, sizeof(update->cookie));
+
+    cfg_buf_put(msg);
+    send_openflow_buffer(msg);
+}
+
+/* Acknowledges the config update with transaction id 'xid'.  The
+ * OFMPCUAF_SUCCESS flag is set only when 'success' is true; the cookie field
+ * is filled in by cfg_get_cookie(). */
+static void
+send_config_update_ack(uint32_t xid, bool success)
+{
+    struct ofmp_config_update_ack *ack;
+    struct ofpbuf *msg;
+
+    ack = make_ofmp_xid(sizeof *ack, OFMPT_CONFIG_UPDATE_ACK, xid, &msg);
+    if (success) {
+        ack->flags = htonl(OFMPCUAF_SUCCESS);
+    }
+    ack->format = htonl(OFMPCOF_SIMPLE);
+    cfg_get_cookie(ack->cookie);
+
+    send_openflow_buffer(msg);
+}
+
+static void
+send_ofmp_error_msg(uint32_t xid, uint16_t type, uint16_t code,
+ const void *data, size_t len)
+{
+ struct ofpbuf *buffer;
+ struct ofmp_error_msg *oem;
+
+ oem = make_ofmp_xid(sizeof(*oem)+len, OFMPT_ERROR, xid, &buffer);
+ oem->type = htons(type);
+ oem->code = htons(code);
+ memcpy(oem->data, data, len);
+ send_openflow_buffer(buffer);
+}
+
+static void
+send_error_msg(uint32_t xid, uint16_t type, uint16_t code,
+ const void *data, size_t len)
+{
+ struct ofpbuf *buffer;
+ struct ofp_error_msg *oem;
+
+ oem = make_openflow_xid(sizeof(*oem)+len, OFPT_ERROR, xid, &buffer);
+ oem->type = htons(type);
+ oem->code = htons(code);
+ memcpy(oem->data, data, len);
+ send_openflow_buffer(buffer);
+}
+
+/* Handles OFPT_ECHO_REQUEST by sending back the reply that
+ * make_echo_reply() builds from the request 'msg'.  Always returns 0. */
+static int
+recv_echo_request(uint32_t xid UNUSED, const void *msg)
+{
+    const struct ofp_header *request = msg;
+    struct ofpbuf *reply = make_echo_reply(request);
+
+    send_openflow_buffer(reply);
+    return 0;
+}
+
+/* Handles OFPT_FEATURES_REQUEST: the request body is not examined, a
+ * features reply with the same 'xid' is sent.  Always returns 0. */
+static int
+recv_features_request(uint32_t xid, const void *msg UNUSED)
+{
+    send_features_reply(xid);
+
+    return 0;
+}
+
+/* Handles OFPT_SET_CONFIG.  The message is accepted and ignored because
+ * there is nothing to configure here.  Always returns 0. */
+static int
+recv_set_config(uint32_t xid UNUSED, const void *msg UNUSED)
+{
+    return 0;
+}
+
+/* Handles OFMPT_CAPABILITY_REQUEST.  The message must be exactly the size of
+ * a capability request and ask for the OFMPCAF_SIMPLE format; otherwise
+ * -EINVAL is returned and nothing is sent.  On success a capability reply
+ * with the same 'xid' is sent and 0 is returned. */
+static int
+recv_ofmp_capability_request(uint32_t xid, const struct ofmp_header *ofmph)
+{
+    const struct ofmp_capability_request *request
+        = (const struct ofmp_capability_request *) ofmph;
+
+    /* xxx Send error in both of the failure cases below. */
+    if (htons(ofmph->header.header.length) != sizeof(*request)
+        || request->format != htonl(OFMPCAF_SIMPLE)) {
+        return -EINVAL;
+    }
+
+    send_capability_reply(xid);
+    return 0;
+}
+
+/* Handles OFMPT_RESOURCES_REQUEST: the request body is not examined, a
+ * resources update tagged with the request's 'xid' is sent.  Always
+ * returns 0. */
+static int
+recv_ofmp_resources_request(uint32_t xid, const void *msg UNUSED)
+{
+    const bool reply_with_xid = true;
+
+    send_resources_update(xid, reply_with_xid);
+    return 0;
+}
+
+/* Handles OFMPT_CONFIG_REQUEST.  The message must be exactly the size of a
+ * config request and ask for the OFMPCOF_SIMPLE format; otherwise -EINVAL is
+ * returned and nothing is sent.  On success the current configuration is
+ * sent back with the same 'xid' and 0 is returned. */
+static int
+recv_ofmp_config_request(uint32_t xid, const struct ofmp_header *ofmph)
+{
+    const struct ofmp_config_request *request
+        = (const struct ofmp_config_request *) ofmph;
+
+    /* xxx Send error in both of the failure cases below. */
+    if (htons(ofmph->header.header.length) != sizeof(*request)
+        || request->format != htonl(OFMPCOF_SIMPLE)) {
+        return -EINVAL;
+    }
+
+    send_config_update(xid, true);
+    return 0;
+}
+
+/* Handles OFMPT_CONFIG_UPDATE: replaces the configuration with the data
+ * carried after the fixed 'struct ofmp_config_update' header.
+ *
+ * Returns -EINVAL, without replying, if the message carries no data beyond
+ * the fixed header or does not use the OFMPCOF_SIMPLE format.  If the
+ * configuration cannot be locked with the supplied cookie, a failure ACK is
+ * sent and 0 is returned.  Otherwise the data is written, a success ACK is
+ * sent, reconfigure() is run, and 0 is returned.
+ *
+ * The payload length is derived from the total message length before any
+ * subtraction is trusted: previously a message shorter than the fixed header
+ * produced a negative 'data_len' that slipped through an unsigned
+ * comparison, and a valid payload no longer than the header was rejected. */
+static int
+recv_ofmp_config_update(uint32_t xid, const struct ofmp_header *ofmph)
+{
+    struct ofmp_config_update *ofmpcu;
+    size_t msg_len;
+    int data_len;
+
+    msg_len = ntohs(ofmph->header.header.length);
+    if (msg_len <= sizeof(*ofmpcu)) {
+        /* xxx Send error. */
+        return -EINVAL;
+    }
+    /* 'msg_len' comes from a 16-bit field, so the difference fits in int. */
+    data_len = (int) (msg_len - sizeof(*ofmpcu));
+
+    ofmpcu = (struct ofmp_config_update *)ofmph;
+    if (ofmpcu->format != htonl(OFMPCOF_SIMPLE)) {
+        /* xxx Send error */
+        return -EINVAL;
+    }
+
+    /* Check if the supplied cookie matches our current understanding of
+     * it.  If they don't match, tell the controller and let it sort
+     * things out. */
+    if (cfg_lock(ofmpcu->cookie, 0)) {
+        /* xxx cfg_lock can fail for other reasons, such as being
+         * xxx locked... */
+        VLOG_WARN_RL(&rl, "config update failed due to bad cookie\n");
+        send_config_update_ack(xid, false);
+        return 0;
+    }
+
+    /* xxx We should probably do more sanity checking than this. */
+
+    cfg_write_data(ofmpcu->data, data_len);
+    cfg_unlock();
+
+    /* Send the ACK before running reconfigure, since our management
+     * connection settings may have changed. */
+    send_config_update_ack(xid, true);
+
+    reconfigure();
+
+    return 0;
+}
+
+/* Dispatches the management message 'ofmph' to the handler for its type and
+ * returns that handler's result.  An unrecognized type is logged and yields
+ * -EINVAL. */
+static
+int recv_ofmp(uint32_t xid, struct ofmp_header *ofmph)
+{
+    /* xxx Should sanity-check for min/max length */
+    switch (ntohs(ofmph->type)) {
+    case OFMPT_CONFIG_UPDATE:
+        return recv_ofmp_config_update(xid, ofmph);
+
+    case OFMPT_CONFIG_REQUEST:
+        return recv_ofmp_config_request(xid, ofmph);
+
+    case OFMPT_RESOURCES_REQUEST:
+        return recv_ofmp_resources_request(xid, ofmph);
+
+    case OFMPT_CAPABILITY_REQUEST:
+        return recv_ofmp_capability_request(xid, ofmph);
+
+    default:
+        break;
+    }
+
+    VLOG_WARN_RL(&rl, "unknown mgmt message: %d", ntohs(ofmph->type));
+    return -EINVAL;
+}
+
+/* Handles a Nicira vendor message.  NXT_MGMT messages go to recv_ofmp();
+ * any other subtype is answered with an OFPBRC_BAD_SUBTYPE error that echoes
+ * the offending message, and -EINVAL is returned. */
+static int
+recv_nx_msg(uint32_t xid, const void *oh)
+{
+    const struct nicira_header *nh = oh;
+
+    if (ntohl(nh->subtype) == NXT_MGMT) {
+        return recv_ofmp(xid, (struct ofmp_header *)oh);
+    }
+
+    send_error_msg(xid, OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE,
+                   oh, ntohs(nh->header.length));
+    return -EINVAL;
+}
+
+/* Handles OFPT_VENDOR.  Messages with the Nicira vendor id go to
+ * recv_nx_msg(); any other vendor is logged, answered with an
+ * OFPBRC_BAD_VENDOR error that echoes the message, and yields -EINVAL. */
+static int
+recv_vendor(uint32_t xid, const void *oh)
+{
+    const struct ofp_vendor_header *ovh = oh;
+
+    if (ntohl(ovh->vendor) == NX_VENDOR_ID) {
+        return recv_nx_msg(xid, oh);
+    }
+
+    VLOG_WARN_RL(&rl, "unknown vendor: 0x%x", ntohl(ovh->vendor));
+    send_error_msg(xid, OFPET_BAD_REQUEST, OFPBRC_BAD_VENDOR,
+                   oh, ntohs(ovh->header.length));
+    return -EINVAL;
+}
+
+/* Processes one OpenFlow message of 'length' bytes at 'msg' whose
+ * transaction id is 'xid'.
+ *
+ * Returns -EINVAL if the length in the OpenFlow header exceeds 'length' or
+ * the message type is not handled here (in which case an OFPBRC_BAD_TYPE
+ * error is also sent), -EFAULT if the message is shorter than the minimum
+ * for its type, 0 for an echo reply, and otherwise whatever the type's
+ * handler returns. */
+static int
+handle_msg(uint32_t xid, const void *msg, size_t length)
+{
+    const struct ofp_header *header = msg;
+    int (*dispatch)(uint32_t, const void *);
+    size_t required;
+
+    COVERAGE_INC(mgmt_received);
+
+    /* The length claimed by the header must fit in what was received. */
+    if (ntohs(header->length) > length) {
+        return -EINVAL;
+    }
+    assert(header->version == OFP_VERSION);
+
+    /* Pick the handler and the smallest acceptable message size. */
+    switch (header->type) {
+    case OFPT_ECHO_REPLY:
+        return 0;
+
+    case OFPT_ECHO_REQUEST:
+        dispatch = recv_echo_request;
+        required = sizeof(struct ofp_header);
+        break;
+
+    case OFPT_FEATURES_REQUEST:
+        dispatch = recv_features_request;
+        required = sizeof(struct ofp_header);
+        break;
+
+    case OFPT_SET_CONFIG:
+        dispatch = recv_set_config;
+        required = sizeof(struct ofp_switch_config);
+        break;
+
+    case OFPT_VENDOR:
+        dispatch = recv_vendor;
+        required = sizeof(struct ofp_vendor_header);
+        break;
+
+    default:
+        VLOG_WARN_RL(&rl, "unknown openflow type: %d", header->type);
+        send_error_msg(xid, OFPET_BAD_REQUEST, OFPBRC_BAD_TYPE,
+                       msg, length);
+        return -EINVAL;
+    }
+
+    return length < required ? -EFAULT : dispatch(xid, msg);
+}
+
+/* Performs periodic work on the management connection: runs the rconn state
+ * machine and then handles up to 50 received messages.  Does nothing if
+ * there is no management connection.
+ *
+ * Every received buffer is freed here, including one too short to hold an
+ * OpenFlow header (previously such buffers were leaked). */
+void
+mgmt_run(void)
+{
+    int i;
+
+    if (!mgmt_rconn) {
+        return;
+    }
+
+    rconn_run(mgmt_rconn);
+
+    /* Do some processing, but cap it at a reasonable amount so that
+     * other processing doesn't starve. */
+    for (i = 0; i < 50; i++) {
+        struct ofpbuf *buffer;
+        struct ofp_header *oh;
+
+        buffer = rconn_recv(mgmt_rconn);
+        if (!buffer) {
+            break;
+        }
+
+        if (buffer->size >= sizeof *oh) {
+            oh = buffer->data;
+            handle_msg(oh->xid, buffer->data, buffer->size);
+        } else {
+            VLOG_WARN_RL(&rl, "received too-short OpenFlow message");
+        }
+        ofpbuf_delete(buffer);
+    }
+}
+
+/* Registers the management connection's wake-up conditions (rconn state
+ * machine and message receipt) for the next poll loop iteration.  Does
+ * nothing if there is no management connection. */
+void
+mgmt_wait(void)
+{
+    if (mgmt_rconn) {
+        rconn_run_wait(mgmt_rconn);
+        rconn_recv_wait(mgmt_rconn);
+    }
+}
+
+/* Returns a management id built from a random Ethernet address whose first
+ * three bytes are overwritten with the Nicira OUI (00:23:20). */
+static uint64_t
+pick_fallback_mgmt_id(void)
+{
+    static const uint8_t nicira_oui[3] = { 0x00, 0x23, 0x20 };
+    uint8_t ea[ETH_ADDR_LEN];
+
+    eth_addr_random(ea);
+    memcpy(ea, nicira_oui, sizeof nicira_oui);
+    return eth_addr_to_uint64(ea);
+}
diff --git a/vswitchd/mgmt.h b/vswitchd/mgmt.h
new file mode 100644
index 00000000..6db66c69
--- /dev/null
+++ b/vswitchd/mgmt.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#ifndef VSWITCHD_MGMT_H
+#define VSWITCHD_MGMT_H 1
+
+void mgmt_init(void);
+void mgmt_reconfigure(void);
+void mgmt_run(void);
+void mgmt_wait(void);
+uint64_t mgmt_get_mgmt_id(void);
+
+#endif /* mgmt.h */
diff --git a/vswitchd/ovs-brcompatd.8.in b/vswitchd/ovs-brcompatd.8.in
new file mode 100644
index 00000000..ebd67028
--- /dev/null
+++ b/vswitchd/ovs-brcompatd.8.in
@@ -0,0 +1,49 @@
+.TH ovs\-brcompatd 8 "March 2009" "Open vSwitch" "Open vSwitch Manual"
+.ds PN ovs\-brcompatd
+.
+.SH NAME
+ovs\-brcompatd \- Bridge compatibility front-end for ovs\-vswitchd
+.
+.SH SYNOPSIS
+.B ovs\-brcompatd
+[\fIoptions\fR] \fIconfig\fR
+.
+.SH DESCRIPTION
+A daemon that provides a legacy bridge front-end for \fBovs\-vswitchd\fR. It
+does this by listening for bridge ioctl commands (e.g., those generated by
+the \fBbrctl\fR program) to add or remove datapaths and the interfaces
+that attach to them. It modifies \fIconfig\fR and forces
+\fBovs\-vswitchd\fR to reload its configuration file.
+.PP
+.SH OPTIONS
+.IP "\fB--reload-command=\fIcommand\fR"
+Sets \fIcommand\fR as the command that \fBovs\-brcompatd\fR runs to force
+\fBovs\-vswitchd\fR to reload its configuration file. The command is run in
+a subshell, so it may contain arbitrary shell metacharacters, etc.
+The \fB--help\fR option displays the default reload command.
+.TP
+\fB--prune-timeout=\fIsecs\fR
+.
+Sets the maximum time between pruning port entries to \fIsecs\fR seconds.
+Pruning ports is the process of removing port entries from \fIconfig\fR
+that no longer exist. If \fIsecs\fR is zero, then entries are never
+pruned. The default prune timeout is 5 seconds.
+.TP
+\fB--lock-timeout=\fImsecs\fR
+.
+Sets the maximum time to wait for \fIconfig\fR to become unlocked to
+\fImsecs\fR milliseconds. The default lock timeout is 500 milliseconds.
+.
+.so lib/daemon.man
+.so lib/vlog.man
+.so lib/common.man
+.so lib/leak-checker.man
+.
+.SH NOTES
+\fBovs\-brcompatd\fR requires the \fBbrcompat_mod.ko\fR kernel module to be
+loaded.
+.SH "SEE ALSO"
+.BR ovs\-appctl (8),
+.BR ovs\-vswitchd (8),
+.BR ovs\-vswitchd.conf (5),
+\fBINSTALL\fR in the Open vSwitch distribution.
diff --git a/vswitchd/ovs-brcompatd.c b/vswitchd/ovs-brcompatd.c
new file mode 100644
index 00000000..93d9469b
--- /dev/null
+++ b/vswitchd/ovs-brcompatd.c
@@ -0,0 +1,766 @@
+/* Copyright (c) 2008, 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <config.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <getopt.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <net/if.h>
+#include <linux/genetlink.h>
+#include <linux/rtnetlink.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "cfg.h"
+#include "command-line.h"
+#include "coverage.h"
+#include "daemon.h"
+#include "dirs.h"
+#include "dpif.h"
+#include "fatal-signal.h"
+#include "fault.h"
+#include "leak-checker.h"
+#include "netdev.h"
+#include "netlink.h"
+#include "ofpbuf.h"
+#include "openvswitch/brcompat-netlink.h"
+#include "poll-loop.h"
+#include "process.h"
+#include "signals.h"
+#include "svec.h"
+#include "timeval.h"
+#include "unixctl.h"
+#include "util.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_brcompatd
+
+
+/* xxx Just hangs if datapath is rmmod/insmod. Learn to reconnect? */
+
+/* Actions to modify bridge compatibility configuration.
+ *
+ * NOTE(review): nothing else in this file refers to this enum or to any of
+ * its constants. */
+enum bmc_action {
+ BMC_ADD_DP,
+ BMC_DEL_DP,
+ BMC_ADD_PORT,
+ BMC_DEL_PORT
+};
+
+static void parse_options(int argc, char *argv[]);
+static void usage(void) NO_RETURN;
+
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 60);
+
+/* Maximum number of milliseconds to wait for the config file to be
+ * unlocked. If set to zero, no waiting will occur. */
+static int lock_timeout = 500;
+
+/* Maximum number of milliseconds to wait before pruning port entries that
+ * no longer exist. If set to zero, ports are never pruned. */
+static int prune_timeout = 5000;
+
+/* Config file shared with ovs-vswitchd (usually ovs-vswitchd.conf). */
+static char *config_file;
+
+/* Command to run (via system()) to reload the ovs-vswitchd configuration
+ * file. */
+static char *reload_command;
+
+/* Netlink socket to listen for interface changes. */
+static struct nl_sock *rtnl_sock;
+
+/* Netlink socket to bridge compatibility kernel module. */
+static struct nl_sock *brc_sock;
+
+/* The Generic Netlink family number used for bridge compatibility. */
+static int brc_family;
+
+static const struct nl_policy brc_multicast_policy[] = {
+ [BRC_GENL_A_MC_GROUP] = {.type = NL_A_U32 }
+};
+
+static const struct nl_policy rtnlgrp_link_policy[] = {
+ [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
+ [IFLA_MASTER] = { .type = NL_A_U32, .optional = true },
+};
+
+/* Asks the brcompat kernel module, over a temporary Generic Netlink socket,
+ * which multicast group it uses and stores the answer in
+ * '*multicast_group'.  'brc_family' must already have been looked up.
+ *
+ * Returns 0 if successful, otherwise a positive errno value (EPROTO if the
+ * reply does not match 'brc_multicast_policy'). */
+static int
+lookup_brc_multicast_group(int *multicast_group)
+{
+    struct nlattr *attrs[ARRAY_SIZE(brc_multicast_policy)];
+    struct ofpbuf request, *reply;
+    struct nl_sock *sock;
+    int error;
+
+    error = nl_sock_create(NETLINK_GENERIC, 0, 0, 0, &sock);
+    if (error) {
+        return error;
+    }
+
+    ofpbuf_init(&request, 0);
+    nl_msg_put_genlmsghdr(&request, sock, 0, brc_family,
+                          NLM_F_REQUEST, BRC_GENL_C_QUERY_MC, 1);
+    error = nl_sock_transact(sock, &request, &reply);
+    ofpbuf_uninit(&request);
+    if (error) {
+        goto exit;
+    }
+
+    if (nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
+                        brc_multicast_policy, attrs,
+                        ARRAY_SIZE(brc_multicast_policy))) {
+        *multicast_group = nl_attr_get_u32(attrs[BRC_GENL_A_MC_GROUP]);
+    } else {
+        error = EPROTO;
+    }
+    ofpbuf_delete(reply);
+
+exit:
+    nl_sock_destroy(sock);
+    return error;
+}
+
+/* Opens a socket for brcompat notifications and stores it in '*sock'.  This
+ * looks up the brcompat Generic Netlink family (saved in 'brc_family') and
+ * its multicast group, then creates a socket subscribed to that group.
+ *
+ * Returns 0 if successful, otherwise a positive errno value. */
+static int
+brc_open(struct nl_sock **sock)
+{
+    int multicast_group = 0;
+    int error;
+
+    error = nl_lookup_genl_family(BRC_GENL_FAMILY_NAME, &brc_family);
+    if (!error) {
+        error = lookup_brc_multicast_group(&multicast_group);
+    }
+    if (!error) {
+        error = nl_sock_create(NETLINK_GENERIC, multicast_group, 0, 0, sock);
+    }
+    return error;
+}
+
+/* Netlink attribute policy for a message that names a datapath.
+ *
+ * NOTE(review): nothing in this file references this table; parse_command()
+ * defines and uses its own local policy instead. */
+static const struct nl_policy brc_dp_policy[] = {
+ [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING },
+};
+
+/* Returns true if the configuration has a "bridge.<name>" section, false
+ * otherwise. */
+static bool
+bridge_exists(const char *name)
+{
+    bool configured = cfg_has_section("bridge.%s", name);
+
+    return configured;
+}
+
+/* If the in-memory configuration has unsaved changes, writes it out, reads
+ * it back in, and runs 'reload_command' through system() so that
+ * ovs-vswitchd picks up the change.  All three steps are attempted even if
+ * an earlier one fails.
+ *
+ * Returns 0 if nothing needed saving or everything succeeded.  Otherwise
+ * returns the write error if there was one, else the read error, else
+ * ECHILD to indicate that the reload command failed. */
+static int
+rewrite_and_reload_config(void)
+{
+    long long int started, took;
+    int write_error, read_error, status;
+
+    if (!cfg_is_dirty()) {
+        return 0;
+    }
+
+    write_error = cfg_write();
+    read_error = cfg_read();
+
+    started = time_msec();
+    status = system(reload_command);
+    took = time_msec() - started;
+    COVERAGE_INC(brcompatd_reload);
+    if (took > 0) {
+        VLOG_INFO("reload command executed in %lld ms", took);
+    }
+
+    if (status == -1) {
+        VLOG_ERR("failed to execute reload command: %s", strerror(errno));
+    } else if (status != 0) {
+        char *msg = process_status_msg(status);
+
+        VLOG_ERR("reload command exited with error (%s)", msg);
+        free(msg);
+    }
+
+    if (write_error) {
+        return write_error;
+    }
+    if (read_error) {
+        return read_error;
+    }
+    return status ? ECHILD : 0;
+}
+
+/* Go through the configuration file and remove any ports that no longer
+ * exist associated with a bridge.
+ *
+ * The configuration is locked without waiting (timeout 0); if the lock cannot
+ * be taken, nothing is done.  For every bridge, each port is expanded into
+ * its bonding slaves (if it has a "bonding.<port>" section) or taken as an
+ * interface itself.  Interfaces for which netdev_nodev_get_flags() reports
+ * ENODEV are removed from all "bridge.*.port" and "bonding.*.slave" entries,
+ * after which the configuration is rewritten and reloaded. */
+static void
+prune_ports(void)
+{
+ int i, j;
+ int error;
+ struct svec bridges, delete;
+
+ if (cfg_lock(NULL, 0)) {
+ /* Couldn't lock config file. */
+ return;
+ }
+
+ /* 'delete' accumulates the names of dead interfaces across all bridges. */
+ svec_init(&bridges);
+ svec_init(&delete);
+ cfg_get_subsections(&bridges, "bridge");
+ for (i=0; i<bridges.n; i++) {
+ const char *br_name = bridges.names[i];
+ struct svec ports, ifaces;
+
+ svec_init(&ports);
+
+ /* Get all the interfaces for the given bridge, breaking bonded
+ * interfaces down into their constituent parts. */
+ svec_init(&ifaces);
+ cfg_get_all_keys(&ports, "bridge.%s.port", br_name);
+ for (j=0; j<ports.n; j++) {
+ const char *port_name = ports.names[j];
+ if (cfg_has_section("bonding.%s", port_name)) {
+ struct svec slaves;
+ svec_init(&slaves);
+ cfg_get_all_keys(&slaves, "bonding.%s.slave", port_name);
+ svec_append(&ifaces, &slaves);
+ svec_destroy(&slaves);
+ } else {
+ svec_add(&ifaces, port_name);
+ }
+ }
+ svec_destroy(&ports);
+
+ /* Check that the interfaces exist. */
+ for (j = 0; j < ifaces.n; j++) {
+ const char *iface_name = ifaces.names[j];
+ enum netdev_flags flags;
+
+ /* The local port and internal ports are created and destroyed by
+ * ovs-vswitchd itself, so don't bother checking for them at all.
+ * In practice, they might not exist if ovs-vswitchd hasn't
+ * finished reloading since the configuration file was updated. */
+ if (!strcmp(iface_name, br_name)
+ || cfg_get_bool(0, "iface.%s.internal", iface_name)) {
+ continue;
+ }
+
+ /* Only ENODEV marks the interface as dead; any other error is just
+ * logged at debug level and the interface is kept. */
+ error = netdev_nodev_get_flags(iface_name, &flags);
+ if (error == ENODEV) {
+ VLOG_DBG_RL(&rl, "removing dead interface %s from %s",
+ iface_name, br_name);
+ svec_add(&delete, iface_name);
+ } else if (error) {
+ VLOG_DBG_RL(&rl, "unknown error %d on interface %s from %s",
+ error, iface_name, br_name);
+ }
+ }
+ svec_destroy(&ifaces);
+ }
+ svec_destroy(&bridges);
+
+ /* Both branches below release the configuration lock; the rewrite and
+ * reload happen only when something was actually deleted. */
+ if (delete.n) {
+ size_t i;
+
+ for (i = 0; i < delete.n; i++) {
+ cfg_del_match("bridge.*.port=%s", delete.names[i]);
+ cfg_del_match("bonding.*.slave=%s", delete.names[i]);
+ }
+ rewrite_and_reload_config();
+ cfg_unlock();
+ } else {
+ cfg_unlock();
+ }
+ svec_destroy(&delete);
+}
+
+
+/* Returns true if a network device named 'name' exists, false otherwise.
+ * The check is whether stat() succeeds on "/sys/class/net/<name>".
+ *
+ * XXX it is possible that this doesn't entirely accomplish what we want in
+ * context, since ovs-vswitchd.conf may cause vswitchd to create or destroy
+ * network devices based on iface.*.internal settings.
+ *
+ * XXX may want to move this to lib/netdev. */
+static bool
+netdev_exists(const char *name)
+{
+    char *sysfs_path = xasprintf("/sys/class/net/%s", name);
+    struct stat st;
+    bool exists;
+
+    exists = !stat(sysfs_path, &st);
+    free(sysfs_path);
+    return exists;
+}
+
+/* Adds a bridge named 'br_name' to the in-memory configuration by creating
+ * the entry "bridge.<br_name>.port=<br_name>".
+ *
+ * Returns 0 if the entry was added, or if a network device of that name
+ * already exists and is configured as a fake bridge (in which case nothing
+ * is changed).  Returns EEXIST if the bridge is already configured or if an
+ * unrelated network device has that name. */
+static int
+add_bridge(const char *br_name)
+{
+    if (bridge_exists(br_name)) {
+        VLOG_WARN("addbr %s: bridge %s exists", br_name, br_name);
+        return EEXIST;
+    }
+
+    if (netdev_exists(br_name)) {
+        if (!cfg_get_bool(0, "iface.%s.fake-bridge", br_name)) {
+            VLOG_WARN("addbr %s: cannot create bridge %s because a network "
+                      "device named %s already exists",
+                      br_name, br_name, br_name);
+            return EEXIST;
+        }
+        VLOG_WARN("addbr %s: %s exists as a fake bridge",
+                  br_name, br_name);
+        return 0;
+    }
+
+    cfg_add_entry("bridge.%s.port=%s", br_name, br_name);
+    VLOG_INFO("addbr %s: success", br_name);
+    return 0;
+}
+
+/* Removes the whole "bridge.<br_name>" section from the in-memory
+ * configuration.  Returns 0 if successful, or ENXIO if no such bridge is
+ * configured. */
+static int
+del_bridge(const char *br_name)
+{
+    if (bridge_exists(br_name)) {
+        cfg_del_section("bridge.%s", br_name);
+        VLOG_INFO("delbr %s: success", br_name);
+        return 0;
+    }
+
+    VLOG_WARN("delbr %s: no bridge named %s", br_name, br_name);
+    return ENXIO;
+}
+
+/* Extracts the fields of a brcompat request from 'buffer'.
+ *
+ * On success stores the Netlink sequence number in '*seq' and the datapath
+ * name in '*br_name'.  If 'port_name' is nonnull the request must also carry
+ * a port name, which is stored in '*port_name'; pass NULL for requests that
+ * have no port.
+ *
+ * Returns 0 if successful, or EINVAL if the message does not match the
+ * policy or a required port name is missing. */
+static int
+parse_command(struct ofpbuf *buffer, uint32_t *seq, const char **br_name,
+              const char **port_name)
+{
+    static const struct nl_policy policy[] = {
+        [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING },
+        [BRC_GENL_A_PORT_NAME] = { .type = NL_A_STRING, .optional = true },
+    };
+    struct nlattr *attrs[ARRAY_SIZE(policy)];
+    const struct nlmsghdr *nlh;
+
+    if (!nl_policy_parse(buffer, NLMSG_HDRLEN + GENL_HDRLEN, policy,
+                         attrs, ARRAY_SIZE(policy))) {
+        return EINVAL;
+    }
+    if (port_name && !attrs[BRC_GENL_A_PORT_NAME]) {
+        return EINVAL;
+    }
+
+    nlh = buffer->data;
+    *seq = nlh->nlmsg_seq;
+    *br_name = nl_attr_get_string(attrs[BRC_GENL_A_DP_NAME]);
+    if (port_name) {
+        *port_name = nl_attr_get_string(attrs[BRC_GENL_A_PORT_NAME]);
+    }
+    return 0;
+}
+
+/* Reports the outcome of a brcompat request back over 'brc_sock': sends a
+ * BRC_GENL_C_DP_RESULT message whose sequence number is 'seq' and whose
+ * BRC_GENL_A_ERR_CODE attribute is 'error'.  A failure to send is only
+ * logged. */
+static void
+send_reply(uint32_t seq, int error)
+{
+    struct nlmsghdr *nlh;
+    struct ofpbuf reply;
+    int send_error;
+
+    /* Compose reply. */
+    ofpbuf_init(&reply, 0);
+    nl_msg_put_genlmsghdr(&reply, brc_sock, 32, brc_family, NLM_F_REQUEST,
+                          BRC_GENL_C_DP_RESULT, 1);
+    nlh = reply.data;
+    nlh->nlmsg_seq = seq;
+    nl_msg_put_u32(&reply, BRC_GENL_A_ERR_CODE, error);
+
+    /* Send reply. */
+    send_error = nl_sock_send(brc_sock, &reply, false);
+    if (send_error) {
+        VLOG_WARN_RL(&rl, "replying to brcompat request: %s",
+                     strerror(send_error));
+    }
+    ofpbuf_uninit(&reply);
+}
+
+/* Handles a request to add ('add' true) or delete ('add' false) the bridge
+ * named in 'buffer'.  If the request parses, the configuration is changed,
+ * rewritten and reloaded, and the result is reported with send_reply().  No
+ * reply is sent for a request that cannot be parsed.
+ *
+ * Returns 0 if successful, otherwise a positive errno value. */
+static int
+handle_bridge_cmd(struct ofpbuf *buffer, bool add)
+{
+    const char *br_name;
+    uint32_t seq;
+    int error;
+
+    error = parse_command(buffer, &seq, &br_name, NULL);
+    if (error) {
+        return error;
+    }
+
+    error = add ? add_bridge(br_name) : del_bridge(br_name);
+    if (!error) {
+        error = rewrite_and_reload_config();
+    }
+    send_reply(seq, error);
+    return error;
+}
+
+/* Netlink attribute policy for a message that names a datapath and a port.
+ *
+ * NOTE(review): nothing in this file references this table; parse_command()
+ * defines and uses its own local policy instead. */
+static const struct nl_policy brc_port_policy[] = {
+ [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING },
+ [BRC_GENL_A_PORT_NAME] = { .type = NL_A_STRING },
+};
+
+/* Removes every in-memory configuration entry that ties 'port_name' to
+ * bridge 'br_name': the bridge's port entry, any bonding slave entry equal
+ * to the port name, and all "vlan.<port_name>.*" keys. */
+static void
+del_port(const char *br_name, const char *port_name)
+{
+    /* The port itself. */
+    cfg_del_entry("bridge.%s.port=%s", br_name, port_name);
+
+    /* Settings that only make sense while the port exists. */
+    cfg_del_match("bonding.*.slave=%s", port_name);
+    cfg_del_match("vlan.%s.*", port_name);
+}
+
+/* Handles a request to add ('add' true) or delete ('add' false) the port
+ * named in 'buffer' on the bridge named there.  Both the bridge (in the
+ * configuration) and the network device must exist, otherwise the request
+ * fails with EINVAL.  The result is reported with send_reply(); no reply is
+ * sent for a request that cannot be parsed.
+ *
+ * Returns 0 if successful, otherwise a positive errno value. */
+static int
+handle_port_cmd(struct ofpbuf *buffer, bool add)
+{
+    const char *cmd_name = add ? "add-if" : "del-if";
+    const char *br_name, *port_name;
+    uint32_t seq;
+    int error;
+
+    error = parse_command(buffer, &seq, &br_name, &port_name);
+    if (error) {
+        return error;
+    }
+
+    if (!bridge_exists(br_name)) {
+        VLOG_WARN("%s %s %s: no bridge named %s",
+                  cmd_name, br_name, port_name, br_name);
+        error = EINVAL;
+    } else if (!netdev_exists(port_name)) {
+        VLOG_WARN("%s %s %s: no network device named %s",
+                  cmd_name, br_name, port_name, port_name);
+        error = EINVAL;
+    } else {
+        if (!add) {
+            del_port(br_name, port_name);
+        } else {
+            cfg_add_entry("bridge.%s.port=%s", br_name, port_name);
+        }
+        VLOG_INFO("%s %s %s: success", cmd_name, br_name, port_name);
+        error = rewrite_and_reload_config();
+    }
+
+    send_reply(seq, error);
+    return error;
+}
+
+/* Receives one message from the brcompat kernel module on 'brc_sock' and
+ * carries out the bridge or port command it contains, holding the
+ * configuration lock while doing so.
+ *
+ * Returns the receive error if nothing could be received (EAGAIN is not
+ * logged), EAGAIN if the configuration could not be locked, EPROTO for an
+ * unrecognized command, and otherwise the command handler's result.
+ * NOTE(review): a message that is too short for Generic Netlink, or whose
+ * type is not 'brc_family', is dropped with a return value of 0. */
+static int
+brc_recv_update(void)
+{
+ int retval;
+ struct ofpbuf *buffer;
+ struct genlmsghdr *genlmsghdr;
+
+
+ /* Keep receiving while the socket reports ENOBUFS or hands us a Netlink
+ * error or NLMSG_DONE message; the previous buffer is freed before each
+ * attempt. */
+ buffer = NULL;
+ do {
+ ofpbuf_delete(buffer);
+ retval = nl_sock_recv(brc_sock, &buffer, false);
+ } while (retval == ENOBUFS
+ || (!retval
+ && (nl_msg_nlmsgerr(buffer, NULL)
+ || nl_msg_nlmsghdr(buffer)->nlmsg_type == NLMSG_DONE)));
+ if (retval) {
+ if (retval != EAGAIN) {
+ VLOG_WARN_RL(&rl, "brc_recv_update: %s", strerror(retval));
+ }
+ return retval;
+ }
+
+ genlmsghdr = nl_msg_genlmsghdr(buffer);
+ if (!genlmsghdr) {
+ VLOG_WARN_RL(&rl, "received packet too short for generic NetLink");
+ goto error;
+ }
+
+ if (nl_msg_nlmsghdr(buffer)->nlmsg_type != brc_family) {
+ VLOG_DBG_RL(&rl, "received type (%"PRIu16") != brcompat family (%d)",
+ nl_msg_nlmsghdr(buffer)->nlmsg_type, brc_family);
+ goto error;
+ }
+
+ if (cfg_lock(NULL, lock_timeout)) {
+ /* Couldn't lock config file. */
+ retval = EAGAIN;
+ goto error;
+ }
+
+ switch (genlmsghdr->cmd) {
+ case BRC_GENL_C_DP_ADD:
+ retval = handle_bridge_cmd(buffer, true);
+ break;
+
+ case BRC_GENL_C_DP_DEL:
+ retval = handle_bridge_cmd(buffer, false);
+ break;
+
+ case BRC_GENL_C_PORT_ADD:
+ retval = handle_port_cmd(buffer, true);
+ break;
+
+ case BRC_GENL_C_PORT_DEL:
+ retval = handle_port_cmd(buffer, false);
+ break;
+
+ default:
+ retval = EPROTO;
+ }
+
+ cfg_unlock();
+
+ /* Common exit: the received buffer is always freed here. */
+error:
+ ofpbuf_delete(buffer);
+ return retval;
+}
+
+/* Check for interface configuration changes announced through RTNL.
+ *
+ * Receives at most one message from 'rtnl_sock'.  If it is an RTM_DELLINK
+ * for an interface that has a master device, and the configuration lists
+ * that interface as a port of the bridge named after the master, the port is
+ * removed and the configuration is rewritten and reloaded.
+ *
+ * The list of bridge ports fetched for that check is released before
+ * returning; it used to be leaked on every such notification. */
+static void
+rtnl_recv_update(void)
+{
+    struct ofpbuf *buf;
+
+    int error = nl_sock_recv(rtnl_sock, &buf, false);
+    if (error == EAGAIN) {
+        /* Nothing to do. */
+    } else if (error == ENOBUFS) {
+        VLOG_WARN_RL(&rl, "network monitor socket overflowed");
+    } else if (error) {
+        VLOG_WARN_RL(&rl, "error on network monitor socket: %s",
+                     strerror(error));
+    } else {
+        struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
+        struct nlmsghdr *nlh;
+        struct ifinfomsg *iim;
+
+        nlh = ofpbuf_at(buf, 0, NLMSG_HDRLEN);
+        iim = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *iim);
+        if (!iim) {
+            VLOG_WARN_RL(&rl, "received bad rtnl message (no ifinfomsg)");
+            ofpbuf_delete(buf);
+            return;
+        }
+
+        if (!nl_policy_parse(buf, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
+                             rtnlgrp_link_policy,
+                             attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
+            VLOG_WARN_RL(&rl,"received bad rtnl message (policy)");
+            ofpbuf_delete(buf);
+            return;
+        }
+        if (nlh->nlmsg_type == RTM_DELLINK && attrs[IFLA_MASTER]) {
+            const char *port_name = nl_attr_get_string(attrs[IFLA_IFNAME]);
+            char br_name[IFNAMSIZ];
+            uint32_t br_idx = nl_attr_get_u32(attrs[IFLA_MASTER]);
+            struct svec ports;
+
+            if (!if_indextoname(br_idx, br_name)) {
+                ofpbuf_delete(buf);
+                return;
+            }
+
+            if (cfg_lock(NULL, lock_timeout)) {
+                /* Couldn't lock config file. */
+                /* xxx this should try again and print error msg. */
+                ofpbuf_delete(buf);
+                return;
+            }
+
+            svec_init(&ports);
+            cfg_get_all_keys(&ports, "bridge.%s.port", br_name);
+            svec_sort(&ports);
+            if (svec_contains(&ports, port_name)) {
+                del_port(br_name, port_name);
+                rewrite_and_reload_config();
+            }
+            svec_destroy(&ports);
+            cfg_unlock();
+        }
+        ofpbuf_delete(buf);
+    }
+}
+
+/* Entry point of ovs-brcompatd.  Parses the command line, daemonizes, opens
+ * the control socket, the brcompat Netlink socket and (when pruning is
+ * enabled) an rtnetlink socket, reads the configuration, and then loops
+ * forever servicing those sources. */
+int
+main(int argc, char *argv[])
+{
+ struct unixctl_server *unixctl;
+ int retval;
+
+ set_program_name(argv[0]);
+ register_fault_handlers();
+ time_init();
+ vlog_init();
+ parse_options(argc, argv);
+ signal(SIGPIPE, SIG_IGN);
+ process_init();
+
+ die_if_already_running();
+ daemonize();
+
+ retval = unixctl_server_create(NULL, &unixctl);
+ if (retval) {
+ ovs_fatal(retval, "could not listen for vlog connections");
+ }
+
+ if (brc_open(&brc_sock)) {
+ ovs_fatal(0, "could not open brcompat socket. Check "
+ "\"brcompat\" kernel module.");
+ }
+
+ /* The rtnetlink socket is only needed for pruning, so 'rtnl_sock' is left
+ * unset when 'prune_timeout' is zero. */
+ if (prune_timeout) {
+ if (nl_sock_create(NETLINK_ROUTE, RTNLGRP_LINK, 0, 0, &rtnl_sock)) {
+ ovs_fatal(0, "could not create rtnetlink socket");
+ }
+ }
+
+ cfg_read();
+
+ /* Main loop: do the pending work, register what to wake up for, then
+ * block until one of those events occurs. */
+ for (;;) {
+ unixctl_server_run(unixctl);
+ brc_recv_update();
+
+ /* If 'prune_timeout' is non-zero, we actively prune from the
+ * config file any 'bridge.<br_name>.port' entries that are no
+ * longer valid. We use two methods:
+ *
+ * 1) The kernel explicitly notifies us of removed ports
+ * through the RTNL messages.
+ *
+ * 2) We periodically check all ports associated with bridges
+ * to see if they no longer exist.
+ */
+ if (prune_timeout) {
+ rtnl_recv_update();
+ prune_ports();
+
+ nl_sock_wait(rtnl_sock, POLLIN);
+ poll_timer_wait(prune_timeout);
+ }
+
+ nl_sock_wait(brc_sock, POLLIN);
+ unixctl_server_wait(unixctl);
+ poll_block();
+ }
+
+ return 0;
+}
+
+/* Parses the command line in 'argc' and 'argv'.  Sets 'lock_timeout',
+ * 'prune_timeout' and 'reload_command' from their options, handles the
+ * daemon, vlog and leak-checker options through their shared macros, and
+ * requires exactly one non-option argument, which becomes 'config_file' and
+ * is registered with cfg_set_file().  Exits the process on --help,
+ * --version, or any usage error. */
+static void
+parse_options(int argc, char *argv[])
+{
+ enum {
+ OPT_LOCK_TIMEOUT = UCHAR_MAX + 1,
+ OPT_PRUNE_TIMEOUT,
+ OPT_RELOAD_COMMAND,
+ VLOG_OPTION_ENUMS,
+ LEAK_CHECKER_OPTION_ENUMS
+ };
+ static struct option long_options[] = {
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'V'},
+ {"lock-timeout", required_argument, 0, OPT_LOCK_TIMEOUT},
+ {"prune-timeout", required_argument, 0, OPT_PRUNE_TIMEOUT},
+ {"reload-command", required_argument, 0, OPT_RELOAD_COMMAND},
+ DAEMON_LONG_OPTIONS,
+ VLOG_LONG_OPTIONS,
+ LEAK_CHECKER_LONG_OPTIONS,
+ {0, 0, 0, 0},
+ };
+ char *short_options = long_options_to_short_options(long_options);
+ int error;
+
+ /* Default reload command.  It is set before option parsing so that
+ * usage() can print it. */
+ reload_command = xasprintf("%s/ovs-appctl -t "
+ "%s/ovs-vswitchd.`cat %s/ovs-vswitchd.pid`.ctl "
+ "-e vswitchd/reload 2>&1 "
+ "| /usr/bin/logger -t brcompatd-reload",
+ ovs_bindir, ovs_rundir, ovs_rundir);
+ for (;;) {
+ int c;
+
+ c = getopt_long(argc, argv, short_options, long_options, NULL);
+ if (c == -1) {
+ break;
+ }
+
+ switch (c) {
+ case 'H':
+ case 'h':
+ usage();
+
+ case 'V':
+ OVS_PRINT_VERSION(0, 0);
+ exit(EXIT_SUCCESS);
+
+ /* NOTE(review): atoi() does no validation, so a non-numeric argument
+ * to either timeout option silently becomes 0. */
+ case OPT_LOCK_TIMEOUT:
+ lock_timeout = atoi(optarg);
+ break;
+
+ /* Given in seconds on the command line, stored in milliseconds. */
+ case OPT_PRUNE_TIMEOUT:
+ prune_timeout = atoi(optarg) * 1000;
+ break;
+
+ /* NOTE(review): the default string allocated above is not freed when
+ * it is replaced here. */
+ case OPT_RELOAD_COMMAND:
+ reload_command = optarg;
+ break;
+
+ VLOG_OPTION_HANDLERS
+ DAEMON_OPTION_HANDLERS
+ LEAK_CHECKER_OPTION_HANDLERS
+
+ case '?':
+ exit(EXIT_FAILURE);
+
+ default:
+ abort();
+ }
+ }
+ free(short_options);
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1) {
+ ovs_fatal(0, "exactly one non-option argument required; "
+ "use --help for usage");
+ }
+
+ config_file = argv[0];
+ error = cfg_set_file(config_file);
+ if (error) {
+ ovs_fatal(error, "failed to add configuration file \"%s\"",
+ config_file);
+ }
+}
+
+/* Prints the help message, including the reload command currently in
+ * effect, and exits successfully.  Does not return. */
+static void
+usage(void)
+{
+    const char *prog = program_name;
+
+    printf("%s: bridge compatibility front-end for ovs-vswitchd\n"
+           "usage: %s [OPTIONS] CONFIG\n"
+           "CONFIG is the configuration file used by ovs-vswitchd.\n",
+           prog, prog);
+
+    /* Options specific to this program. */
+    printf("\nConfiguration options:\n"
+           " --reload-command=COMMAND shell command to reload ovs-vswitchd\n"
+           " --prune-timeout=SECS wait at most SECS before pruning ports\n"
+           " --lock-timeout=MSECS wait at most MSECS for CONFIG to unlock\n"
+          );
+
+    /* Options shared with the other daemons. */
+    daemon_usage();
+    vlog_usage();
+    printf("\nOther options:\n"
+           " -h, --help display this help message\n"
+           " -V, --version display version information\n");
+    leak_checker_usage();
+
+    printf("\nThe default reload command is:\n%s\n", reload_command);
+    exit(EXIT_SUCCESS);
+}
diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in
new file mode 100644
index 00000000..28e55ba3
--- /dev/null
+++ b/vswitchd/ovs-vswitchd.8.in
@@ -0,0 +1,87 @@
+.TH ovs\-vswitchd 8 "March 2009" "Open vSwitch" "OpenVSwitch Manual"
+.ds PN ovs\-vswitchd
+.
+.SH NAME
+ovs\-vswitchd \- virtual switch daemon
+.
+.SH SYNOPSIS
+.B ovs\-vswitchd
+\fIconfig\fR
+.
+.SH DESCRIPTION
+A daemon that manages and controls any number of virtual switches on
+the local machine.
+.PP
+The mandatory \fIconfig\fR argument specifies a configuration file.
+For a description of \fBovs\-vswitchd\fR configuration syntax, see
+\fBovs\-vswitchd.conf\fR(5).
+.PP
+At startup or upon receipt of a \fBSIGHUP\fR signal, \fBovs\-vswitchd\fR
+reads the configuration file. It sets up Open vSwitch datapaths and then
+operates switching across each bridge described in its configuration
+files. If a logfile was specified on the command line it will also
+be opened or reopened.
+.PP
+\fBovs\-vswitchd\fR virtual switches may be configured with any of the
+following features:
+.
+.IP \(bu
+L2 switching with MAC learning.
+.
+.IP \(bu
+NIC bonding with automatic fail-over and source MAC-based TX load
+balancing ("SLB").
+.
+.IP \(bu
+802.1Q VLAN support.
+.
+.IP \(bu
+Port mirroring, with optional VLAN tagging.
+.
+.IP \(bu
+NetFlow v5 flow logging.
+.
+.IP \(bu
+Connectivity to an external OpenFlow controller, such as NOX.
+.
+.PP
+Only a single instance of \fBovs\-vswitchd\fR is intended to run at a time.
+A single \fBovs\-vswitchd\fR can manage any number of virtual switches, up
+to the maximum number of supported Open vSwitch datapaths.
+.PP
+\fBovs\-vswitchd\fR does all the necessary management of Open vSwitch datapaths
+itself. Thus, external tools, such as \fBovs\-dpctl\fR(8), are not needed for
+managing datapaths in conjunction with \fBovs\-vswitchd\fR, and their use
+to modify datapaths when \fBovs\-vswitchd\fR is running can interfere with
+its operation. (\fBovs\-dpctl\fR may still be useful for diagnostics.)
+.PP
+An Open vSwitch datapath kernel module must be loaded for \fBovs\-vswitchd\fR
+to be useful. Please refer to the \fBINSTALL\fR file included in the
+Open vSwitch distribution for instructions on how to build and load
+the Open vSwitch kernel module.
+.PP
+.SH OPTIONS
+.IP "\fB--fake-proc-net\fR"
+Causes \fBovs\-vswitchd\fR to simulate some files in \fB/proc/net/vlan\fR
+and \fB/proc/net/bonding\fR that some legacy software expects to
+exist. This option should only be used if such legacy software is
+actually in use. It requires the \fBbrcompat_mod.ko\fR kernel module
+to be loaded.
+.
+.so lib/daemon.man
+.so lib/vlog.man
+.so lib/common.man
+.so lib/leak-checker.man
+.
+.SH "BUGS"
+.
+Only Open vSwitch kernel-based datapaths are currently supported. In the
+future, this restriction may be lifted.
+.PP
+Only Linux 2.6.\fIx\fR is currently supported.
+.
+.SH "SEE ALSO"
+.BR ovs\-appctl (8),
+.BR ovs\-vswitchd.conf (5),
+.BR ovs\-brcompatd (8),
+\fBINSTALL\fR in the Open vSwitch distribution.
diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c
new file mode 100644
index 00000000..9528ec5f
--- /dev/null
+++ b/vswitchd/ovs-vswitchd.c
@@ -0,0 +1,255 @@
+/* Copyright (c) 2008, 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#include <config.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bridge.h"
+#include "cfg.h"
+#include "command-line.h"
+#include "compiler.h"
+#include "daemon.h"
+#include "fault.h"
+#include "leak-checker.h"
+#include "mgmt.h"
+#include "ovs-vswitchd.h"
+#include "poll-loop.h"
+#include "port.h"
+#include "proc-net-compat.h"
+#include "process.h"
+#include "signals.h"
+#include "svec.h"
+#include "timeval.h"
+#include "unixctl.h"
+#include "util.h"
+#include "vconn-ssl.h"
+#include "vconn.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_vswitchd
+
+static void parse_options(int argc, char *argv[]);
+static void usage(void) NO_RETURN;
+static void reload(struct unixctl_conn *, const char *args);
+
+static bool need_reconfigure;
+static struct unixctl_conn **conns;
+static size_t n_conns;
+
+/* Entry point for ovs-vswitchd.
+ *
+ * Performs one-time process setup (program name, fault handlers, logging,
+ * command-line parsing, signal handling, daemonization), creates the unixctl
+ * control server and registers the "vswitchd/reload" command, reads the
+ * configuration, initializes the mgmt, bridge, and port modules, and then
+ * runs the main poll loop forever.
+ *
+ * The loop has no exit path, so the trailing 'return 0' is never reached;
+ * the process ends only through exit()/ovs_fatal() calls made elsewhere. */
+int
+main(int argc, char *argv[])
+{
+ struct unixctl_server *unixctl;
+ struct signal *sighup;
+ int retval;
+
+ set_program_name(argv[0]);
+ register_fault_handlers();
+ time_init();
+ vlog_init();
+ parse_options(argc, argv);
+ /* Ignore SIGPIPE so that it cannot terminate the process. */
+ signal(SIGPIPE, SIG_IGN);
+ /* SIGHUP is polled from the main loop below to trigger a reconfigure. */
+ sighup = signal_register(SIGHUP);
+ process_init();
+
+ die_if_already_running();
+ daemonize();
+
+ retval = unixctl_server_create(NULL, &unixctl);
+ if (retval) {
+ ovs_fatal(retval, "could not listen for control connections");
+ }
+ unixctl_command_register("vswitchd/reload", reload);
+
+ cfg_read();
+ mgmt_init();
+ bridge_init();
+ port_init();
+ mgmt_reconfigure();
+
+ need_reconfigure = false;
+ for (;;) {
+ /* Reconfigure when requested by reload(), by a nonzero return from
+ * bridge_run() on the previous iteration, or by a received SIGHUP. */
+ if (need_reconfigure || signal_poll(sighup)) {
+ need_reconfigure = false;
+ vlog_reopen_log_file();
+ reconfigure();
+ }
+ mgmt_run();
+ if (bridge_run()) {
+ need_reconfigure = true;
+ }
+ /* May invoke reload(), which also sets 'need_reconfigure'. */
+ unixctl_server_run(unixctl);
+
+ /* A reconfigure is pending: presumably this keeps poll_block() below
+ * from sleeping so the next iteration runs right away -- confirm in
+ * poll-loop.h. */
+ if (need_reconfigure) {
+ poll_immediate_wake();
+ }
+ signal_wait(sighup);
+ mgmt_wait();
+ bridge_wait();
+ unixctl_server_wait(unixctl);
+ poll_block();
+ }
+
+ return 0;
+}
+
+/* Handler for the "vswitchd/reload" unixctl command.
+ *
+ * Appends 'conn' to the global 'conns' array and flags that a reconfigure
+ * is needed.  No reply is sent here; reconfigure() replies to every queued
+ * connection once the new configuration has been applied.  'args' is
+ * ignored. */
+static void
+reload(struct unixctl_conn *conn, const char *args UNUSED)
+{
+    size_t slot = n_conns;
+
+    /* Grow the pending-connection array by one and store 'conn' at the
+     * end. */
+    conns = xrealloc(conns, (slot + 1) * sizeof *conns);
+    conns[slot] = conn;
+    n_conns = slot + 1;
+
+    need_reconfigure = true;
+}
+
+/* Re-reads the configuration and applies it to the bridge, mgmt, and port
+ * modules, in that order.  Afterward, sends a reply with code 202 and no
+ * body to every connection queued in 'conns', then empties that queue. */
+void
+reconfigure(void)
+{
+    size_t idx = 0;
+
+    cfg_read();
+    bridge_reconfigure();
+    mgmt_reconfigure();
+    port_reconfigure();
+
+    /* Answer each queued connection now that reconfiguration is done. */
+    while (idx < n_conns) {
+        unixctl_command_reply(conns[idx], 202, NULL);
+        idx++;
+    }
+
+    /* Reset the queue to empty. */
+    n_conns = 0;
+    free(conns);
+    conns = NULL;
+}
+
+/* Parses the command line in 'argc' and 'argv'.
+ *
+ * Options are handled as they are encountered: --help prints usage and
+ * exits, --version prints version information and exits, --fake-proc-net
+ * calls proc_net_compat_init(), and the remaining cases are supplied by the
+ * VLOG, DAEMON, VCONN_SSL, and LEAK_CHECKER option-handler macros.  An
+ * option rejected by getopt_long() ('?') exits with EXIT_FAILURE.
+ *
+ * After the options, exactly one non-option argument must remain.  It is
+ * taken as the configuration file name and passed to cfg_set_file(); a
+ * wrong argument count or a cfg_set_file() error is fatal. */
+static void
+parse_options(int argc, char *argv[])
+{
+ /* Long-only options get values starting at UCHAR_MAX + 1, above the range
+ * of any single option character. */
+ enum {
+ OPT_PEER_CA_CERT = UCHAR_MAX + 1,
+ OPT_FAKE_PROC_NET,
+ VLOG_OPTION_ENUMS,
+ LEAK_CHECKER_OPTION_ENUMS
+ };
+ static struct option long_options[] = {
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'V'},
+ {"fake-proc-net", no_argument, 0, OPT_FAKE_PROC_NET},
+ DAEMON_LONG_OPTIONS,
+ VLOG_LONG_OPTIONS,
+ LEAK_CHECKER_LONG_OPTIONS,
+#ifdef HAVE_OPENSSL
+ VCONN_SSL_LONG_OPTIONS
+ {"peer-ca-cert", required_argument, 0, OPT_PEER_CA_CERT},
+#endif
+ {0, 0, 0, 0},
+ };
+ /* Short-option string derived from the table above; freed after the
+ * parsing loop. */
+ char *short_options = long_options_to_short_options(long_options);
+ const char *config_file;
+ int error;
+
+ for (;;) {
+ int c;
+
+ c = getopt_long(argc, argv, short_options, long_options, NULL);
+ if (c == -1) {
+ break;
+ }
+
+ switch (c) {
+ case 'H':
+ case 'h':
+ /* usage() is declared NO_RETURN, so control does not fall through
+ * into the 'V' case. */
+ usage();
+
+ case 'V':
+ OVS_PRINT_VERSION(OFP_VERSION, OFP_VERSION);
+ exit(EXIT_SUCCESS);
+
+ case OPT_FAKE_PROC_NET:
+ error = proc_net_compat_init();
+ if (error) {
+ ovs_fatal(error, "failed to initialize /proc/net "
+ "compatibility");
+ }
+ break;
+
+ VLOG_OPTION_HANDLERS
+ DAEMON_OPTION_HANDLERS
+ VCONN_SSL_OPTION_HANDLERS
+ LEAK_CHECKER_OPTION_HANDLERS
+
+#ifdef HAVE_OPENSSL
+ case OPT_PEER_CA_CERT:
+ vconn_ssl_set_peer_ca_cert_file(optarg);
+ break;
+#endif
+
+ case '?':
+ exit(EXIT_FAILURE);
+
+ default:
+ abort();
+ }
+ }
+ free(short_options);
+
+ /* Skip past the parsed options so that argv[0] is the first non-option
+ * argument. */
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1) {
+ ovs_fatal(0, "config file is only non-option argument; "
+ "use --help for usage");
+ }
+
+ config_file = argv[0];
+ error = cfg_set_file(config_file);
+ if (error) {
+ ovs_fatal(error, "failed to add configuration file \"%s\"",
+ config_file);
+ }
+}
+
+/* Prints a help message for ovs-vswitchd with printf() and then exits with
+ * EXIT_SUCCESS; it never returns to the caller.
+ *
+ * Besides the text printed directly here, the message includes whatever
+ * daemon_usage(), vlog_usage(), and leak_checker_usage() emit (presumably the
+ * help text for the options those modules handle -- confirm in their
+ * sources). */
+static void
+usage(void)
+{
+ printf("%s: virtual switch daemon\n"
+ "usage: %s [OPTIONS] CONFIG\n"
+ "CONFIG is a configuration file in ovs-vswitchd.conf(5) format.\n",
+ program_name, program_name);
+ daemon_usage();
+ vlog_usage();
+ printf("\nLegacy compatibility options:\n"
+ " --fake-proc-net simulate some files in /proc/net\n"
+ "\nOther options:\n"
+ " -h, --help display this help message\n"
+ " -V, --version display version information\n");
+ leak_checker_usage();
+ exit(EXIT_SUCCESS);
+}
diff --git a/vswitchd/ovs-vswitchd.conf.5.in b/vswitchd/ovs-vswitchd.conf.5.in
new file mode 100644
index 00000000..89872184
--- /dev/null
+++ b/vswitchd/ovs-vswitchd.conf.5.in
@@ -0,0 +1,642 @@
+.\" -*- nroff -*-
+.de TQ
+. br
+. ns
+. TP "\\$1"
+..
+.de IQ
+. br
+. ns
+. IP "\\$1"
+..
+.de ST
+. PP
+. RS -0.15in
+. I "\\$1"
+. RE
+. PP
+..
+.TH ovs\-vswitchd.conf 5 "April 2009" "Open vSwitch" "OpenVSwitch Manual"
+.
+.SH NAME
+ovs\-vswitchd.conf \- configuration file for \fBovs\-vswitchd\fR
+.
+.SH DESCRIPTION
+This manual page describes the syntax for the configuration file used
+by \fBovs\-vswitchd\fR(8), the virtual switch daemon.
+.PP
+The configuration file is based on key-value pairs, which are given
+one per line in the form \fIkey\fB=\fIvalue\fR. Each \fIkey\fR
+consists of one or more parts separated by dots,
+e.g. \fIpart1\fB.\fIpart2\fB.\fIpart3\fR. Each \fIpart\fR may consist
+only of the English letters, digits, and the special characters
+\fB_-@$:+\fR. White space within \fIvalue\fR and at the beginning of a
+line is significant, but is otherwise ignored.
+.PP
+If a single key is specified more than once, that key has multiple
+values, one value for each time the key is specified. The ordering of
+key-value pairs, and the ordering of multiple values for a single key,
+within a configuration file is not significant.
+.PP
+Blank lines, lines that consist only of white space, and lines that
+begin with \fB#\fR (optionally preceded by white space) are ignored.
+Keep in mind that programs that modify the configuration file, such as
+\fBovs\-brcompatd\fR and \fBovs-cfg-mod\fR, may alter the order of
+elements and
+strip comments and blank lines.
+.PP
+The following subsections describe how key-value pairs are used to
+configure \fBovs\-vswitchd\fR.
+.SS "Bridge Configuration"
+A bridge (switch) with a given \fIname\fR is configured by specifying
+the names of its network devices as values for key
+\fBbridge.\fIname\fB.port\fR. (The specified \fIname\fR may not begin
+with \fBdp\fR or \fBnl:\fR followed by a digit.)
+.PP
+The names given on \fBbridge.\fIname\fB.port\fR must be the names of
+existing network devices, except for ``internal ports.'' An internal
+port is a simulated network device that receives traffic only
+through the virtual switch and switches any traffic sent to it through
+the virtual switch. An internal port may be configured with an IP address,
+etc. using the usual system tools (e.g. \fBifconfig\fR, \fBip\fR). To
+designate network device \fInetdev\fR as an internal port, add
+\fBiface.\fInetdev\fB.internal=true\fR to the configuration file.
+\fBovs\-vswitchd\fR will honor this configuration setting by automatically
+creating the named internal port.
+.PP
+A bridge with a given \fIname\fR always has an internal port with the
+same \fIname\fR, called the ``local port.'' This network device may
+be included
+in the bridge, by specifying it as one of the values for key
+\fBbridge.\fIname\fB.port\fR, or it may be omitted. If it is
+included, then its MAC address is by default the lowest-numbered MAC
+address among the other bridge ports, ignoring other internal ports
+and bridge ports that are
+used as port mirroring destinations (see \fBPort Mirroring\fR, below). To
+use a specific MAC address instead, set \fBbridge.\fIname\fB.mac\fR to
+a MAC address in the format
+\fIxx\fB:\fIxx\fB:\fIxx\fB:\fIxx\fB:\fIxx\fB:\fIxx\fR, where each
+\fIx\fR is a hex digit. If no valid MAC address can be determined
+either of these ways, then a MAC address is randomly generated.
+.PP
+The following syntax defines a bridge named \fBmybr\fR, configured
+with network devices \fBeth0\fR, \fBeth1\fR, and \fBeth2\fR:
+.RS
+.nf
+
+bridge.mybr.port=eth0
+bridge.mybr.port=eth1
+bridge.mybr.port=eth2
+
+.fi
+.RE
+.SS "802.1Q VLAN support"
+A bridge port may be configured either as a trunk port or as belonging
+to a single, untagged VLAN. These two options are mutually exclusive,
+and a port must be configured in one way or the other.
+.ST "Trunk Ports"
+By default, bridge ports are trunk ports that carry all VLANs. To
+limit the VLANs that a trunk port carries, define
+\fBvlan.\fIport\fB.trunks\fR to one or more integers between 0 and
+4095 designating VLANs. Only frames that have an 802.1Q header with
+one of the listed VLANs are accepted on a trunk port. If 0 is
+included in the list, then frames without an 802.1Q header are also
+accepted. Other frames are discarded.
+.PP
+The following syntax makes network device \fBeth0\fR a trunk port that
+carries VLANs 1, 2, and 3:
+.PP
+.RS
+.nf
+
+vlan.eth0.trunks=1
+vlan.eth0.trunks=2
+vlan.eth0.trunks=3
+
+.fi
+.RE
+.ST "Untagged VLAN Ports"
+A bridge port may be configured with an implicit, untagged VLAN.
+Define key
+\fBvlan.\fIport\fB.tag\fR to an integer value \fIvid\fR between 0 and
+4095, inclusive, to designate the named \fIport\fR as a member
+of 802.1Q VLAN \fIvid\fR. When \fIport\fR is assigned a VLAN tag this
+way, frames arriving on trunk ports will be forwarded to \fIport\fR
+only if they are tagged with VLAN \fIvid\fR, and frames arriving on
+other VLAN ports will be forwarded to \fIport\fR only if their
+\fIvid\fR values are equal. Frames forwarded to \fIport\fR will not
+have an 802.1Q header.
+.PP
+When \fIvid\fR is 0, frames arriving on trunk ports without an 802.1Q
+VLAN header will also be forwarded to \fIport\fR.
+.PP
+When a frame with an 802.1Q header that indicates a nonzero VLAN is
+received on an implicit VLAN port, it is discarded.
+.PP
+The following syntax makes network device \fBeth0\fR a member of VLAN
+101:
+.PP
+.RS
+.nf
+
+vlan.eth0.tag=101
+
+.fi
+.RE
+.SS "Network Device Bonding"
+Bonding allows multiple ``slave'' network devices to be treated as if
+they were a single virtual ``bonded'' network device. It is useful for
+load balancing and fail-over.
+.PP
+\fBovs\-vswitchd\fR supports ``source load balancing'' (SLB) bonding, which
+assigns flows to slaves based on source MAC address, with periodic
+rebalancing as traffic patterns change. This form of bonding does not
+require 802.3ad or other special support from the upstream switch to
+which the slave devices are connected.
+.PP
+To configure bonding, create a virtual bonding device by specifying
+the slave network device names as values for
+\fBbonding.\fIname\fB.slave\fR, then specify \fIname\fR as a bridge
+port. The chosen \fIname\fR should not be the name of any real
+network device on the host system.
+.PP
+By default, bonding interfaces are enabled or disabled immediately
+when a carrier is detected or dropped on the underlying network
+device. To insert a delay when carrier comes up or goes down before
+enabling or disabling an interface, set the value of
+\fBbonding.\fIname\fB.updelay\fR or
+\fBbonding.\fIname\fB.downdelay\fR, respectively, to a positive
+integer, interpreted in milliseconds.
+.PP
+The following syntax bonds \fBeth0\fR and \fBeth1\fR into a bonding
+device named \fBbond0\fR, which is added to bridge \fBmybr\fR along
+with physical network devices \fBeth2\fR and \fBeth3\fR:
+.PP
+.RS
+.nf
+
+bridge.mybr.port=bond0
+bridge.mybr.port=eth2
+bridge.mybr.port=eth3
+
+bonding.bond0.slave=eth0
+bonding.bond0.slave=eth1
+
+.fi
+.RE
+.SS "Port Mirroring (SPAN and RSPAN)"
+\fBovs\-vswitchd\fR may be configured to send selected frames to special
+``mirrored'' ports, in addition to their normal destinations. Mirroring
+traffic may also be referred to as SPAN or RSPAN, depending on the
+mechanism used for delivery.
+.PP
+Up to 32 instances of port mirroring may be configured on a given
+bridge. Each must be given a name that is unique within the bridge.
+The keys associated with port mirroring instance \fIpmname\fR for
+bridge \fIbrname\fR begin with \fBmirror.\fIbrname\fB.\fIpmname\fR.
+.PP
+The selection of the frames to mirror and the form in which they
+should be output are configured separately for each port mirroring
+instance, through subsections of
+\fBmirror.\fIbrname\fB.\fIpmname\fR named \fBselect\fR and
+\fBoutput\fR, respectively.
+.ST "Selecting Frames to Mirror"
+The values for the following keys, if specified, limit the frames that
+are chosen for mirroring. If none of these keys is specified, then
+all frames received by the bridge are mirrored. If more than one of
+these keys is specified, then a frame must meet all specified criteria
+to be mirrored.
+.TP
+\fBmirror.\fIbrname\fB.\fIpmname\fB.select.src-port=\fIport\fR
+.TQ
+\fBmirror.\fIbrname\fB.\fIpmname\fB.select.dst-port=\fIport\fR
+.TQ
+\fBmirror.\fIbrname\fB.\fIpmname\fB.select.port=\fIport\fR
+Frame received on \fIport\fR, output to \fIport\fR, or either received
+on or output to \fIport\fR, respectively. \fIport\fR must be part of
+the bridge \fIbrname\fR; that is, it must be listed on
+\fBbridge.\fIbrname\fB.port\fR.
+.TP
+\fBmirror.\fIbrname\fB.\fIpmname\fB.select.vlan=\fIvid\fR
+.
+\fIvid\fR must be an integer between 0 and 4095, inclusive. A nonzero
+\fIvid\fR selects frames that belong to VLAN \fIvid\fR, that is,
+frames that arrived on a trunk port tagged with VLAN \fIvid\fR or on a
+port that is configured as part of VLAN \fIvid\fR (see \fB802.1Q VLAN
+support\fR, above). A \fIvid\fR of zero selects frames that do not
+belong to a VLAN, that is, frames that arrived on a trunk port without
+a VLAN tag or tagged with VLAN 0.
+.ST "Mirror Output"
+The values of the following keys determine how frames selected for
+mirroring are output. Only one of the keys may be specified.
+.TP
+\fBmirror.\fIbrname\fB.\fIpmname\fB.output.port=\fIport\fR
+.
+Causes the selected frames to be sent out \fIport\fR, which must be
+part of the bridge \fIbrname\fR; that is, it must be listed on
+\fBbridge.\fIbrname\fB.port\fR.
+.IP
+Specifying a \fIport\fR in this way reserves that port exclusively for
+mirroring. No frames other than those selected for mirroring will be
+forwarded to \fIport\fR, and any frames received on \fIport\fR will be
+discarded. This type of mirroring may be referred to as SPAN.
+.TP
+\fBmirror.\fIbrname\fB.\fIpmname\fB.output.vlan=\fIvid\fR
+.
+Causes the selected frames to be sent on the VLAN numbered \fIvid\fR,
+which must be an integer between 0 and 4095, inclusive. The frames
+will be sent out all ports that trunk VLAN \fIvid\fR, as well as any
+ports with implicit VLAN \fIvid\fR. When a mirrored frame is sent out
+a trunk port, the frame's VLAN tag will be set to \fIvid\fR, replacing
+any existing tag; when it is sent out an implicit VLAN port, the frame
+will not be tagged. This type of mirroring may be referred to as
+RSPAN.
+.ST "Example"
+The following \fBovs\-vswitchd\fR configuration copies all frames received
+on \fBeth1\fR or \fBeth2\fR to \fBeth3\fR.
+.PP
+.RS
+.nf
+
+bridge.mybr.port=eth1
+bridge.mybr.port=eth2
+bridge.mybr.port=eth3
+
+mirror.mybr.a.select.src-port=eth1
+mirror.mybr.a.select.src-port=eth2
+mirror.mybr.a.output.port=eth3
+
+.fi
+.RE
+.SS "Port Rate-Limiting"
+Traffic policing and shaping are configured on physical ports. Policing
+defines a hard limit at which traffic that exceeds the specified rate is
+dropped. Shaping uses queues to delay packets so that egress traffic
+leaves at the specified rate.
+
+.ST "Ingress Policing"
+The rate at which traffic is allowed to enter through a particular
+physical port can be configured with ingress policing. The rate is
+specified in kilobits (1000 bits) per second with a maximum burst size
+specified in kilobits (1000 bits). The burst size should be at least
+the size of the port's MTU.
+
+A port may be configured to enforce ingress policing by defining the
+key \fBport.\fIname\fB.ingress.policing-rate\fR with an integer
+indicating the rate. The port \fIname\fR will only allow traffic to be
+received at the rate specified in kilobits per second. If the rate is zero
+or the key is not defined, then ingress policing is disabled.
+
+If ingress policing is enabled, then the burst size may be set by defining
+the key \fBport.\fIname\fB.ingress.policing-burst\fR with an integer
+indicating the burst size in kilobits. If the key is not supplied or is
+zero, then the default burst is 10 kilobits.
+
+.PP
+The following syntax limits port \fBeth1\fR to receiving traffic at
+\fB512\fR kilobits per second with a burst of \fB20\fR kilobits:
+.PP
+.RS
+.nf
+
+port.eth1.ingress.policing-rate=512
+port.eth1.ingress.policing-burst=20
+
+.fi
+.RE
+.SS "NetFlow v5 Flow Logging"
+NetFlow is a protocol that exports a number of details about terminating
+IP flows, such as the principals involved and duration. A bridge may be
+configured to send NetFlow v5 records to NetFlow collectors when flows
+end. To enable, define the key \fBnetflow.\fIbridge\fB.host\fR for each
+collector in the form \fIhost\fB:\fIport\fR. Records from \fIbridge\fR
+will be sent to each \fIhost\fR on UDP \fIport\fR.
+
+The NetFlow messages will use the datapath index for the engine type and id.
+These can be overridden with the keys \fBnetflow.\fIbridge\fB.engine-type\fR and
+\fBnetflow.\fIbridge\fB.engine-id\fR, respectively. Each takes a value
+between 0 and 255, inclusive.
+
+Many NetFlow collectors do not expect multiple virtual switches to be
+sending messages from the same host, and they do not store the engine
+information which could be used to disambiguate the traffic. To prevent
+flows from multiple switches appearing as if they came on the same interface,
+add \fBnetflow.\fIbridge\fB.add-id-to-iface=true\fR to the configuration
+file. This will place the least significant 7 bits of the engine id
+into the most significant bits of the ingress and egress interface fields
+of flow records. By default, this behavior is disabled.
+
+The following syntax sends NetFlow records for \fBmybr\fR to the NetFlow
+collector \fBnflow.example.com\fR on UDP port \fB9995\fR:
+.PP
+.RS
+.nf
+
+netflow.mybr.host=nflow.example.com:9995
+
+.fi
+.RE
+.SS "Remote Management"
+An \fBovs\-vswitchd\fR instance may be remotely managed by a controller that
+supports the OpenFlow Management Protocol, such as NOX. This
+functionality is enabled by setting the key \fBmgmt.controller\fR to one
+of the following values:
+.
+.TP
+\fBssl:\fIhost\fR[\fB:\fIport\fR]
+The specified SSL \fIport\fR (default: 6633) on the given remote
+\fIhost\fR. SSL must be configured when this form is used (see \fBSSL
+Configuration\fR, below).
+.
+.TP
+\fBtcp:\fIhost\fR[\fB:\fIport\fR]
+The specified TCP \fIport\fR (default: 6633) on the given remote
+\fIhost\fR.
+.PP
+The maximum time between attempts to connect to the controller may be
+specified in integral seconds with the \fBmgmt.max-backoff\fR key. The
+default maximum backoff is 15 seconds, and the minimum value is 1
+second.
+
+An inactivity probe may be configured with the \fBmgmt.inactivity-probe\fR
+key. If \fBovs\-vswitchd\fR does not communicate with the controller for the
+specified number of seconds, it will send a probe. If a response is not
+received for an additional amount of that time, \fBovs\-vswitchd\fR assumes
+the connection has been broken and attempts to reconnect. The default
+is 15 seconds, and the minimum value is 5 seconds.
+
+A management id may be specified with the \fBmgmt.id\fR key. It takes
+an id in the form of exactly 12 hexadecimal digits. If one is not
+specified, a random id is generated each time \fBovs\-vswitchd\fR is started.
+.fi
+.RE
+.SS "OpenFlow Controller Connectivity"
+\fBovs\-vswitchd\fR can perform all configured bridging and switching
+locally, or it can be configured to connect a given bridge to an
+external OpenFlow controller, such as NOX. Its behavior depends on
+the \fBbridge.\fIname\fB.controller\fR setting:
+.
+.TP
+\fI\[la]unset\[ra]\fR
+When the key is not set, the behavior depends on whether remote
+management is configured. If management is configured, then the switch
+will connect to the controller specified on \fBmgmt.controller\fR. If
+management is not configured, the switch will perform all configured
+bridging and switching locally.
+.
+.TP
+\fI\[la]empty\[ra]\fR
+Setting an empty string value disables controller connectivity. The
+switch will perform all configured bridging and switching locally.
+.
+.TP
+\fBdiscover\fR
+Use controller discovery to find the local OpenFlow controller.
+Refer to \fBsecchan\fR(8) for information on how to configure a DHCP
+server to support controller discovery. The following additional
+options control the discovery process:
+.
+.RS
+.TP
+\fBbridge.\fIname\fB.controller.accept-regex=\fIregex\fR
+A POSIX extended regular expression against which the discovered
+controller location is validated. Only controllers whose names match
+the regular expression will be accepted.
+.IP
+The default regular expression is \fBssl:.*\fR, meaning that only SSL
+controller connections will be accepted, when SSL is configured (see
+\fBSSL Configuration\fR), and \fB.*\fR otherwise, meaning that any
+controller will be accepted.
+.IP
+The regular expression is implicitly anchored at the beginning of the
+controller location string, as if it begins with \fB^\fR.
+.TP
+\fBbridge.\fIname\fB.controller.update-resolv.conf=\fBtrue\fR|\fBfalse\fR
+By default, or if this is set to \fBtrue\fR, \fBovs\-vswitchd\fR overwrites
+the system's \fB/etc/resolv.conf\fR with domain information and DNS
+servers obtained via DHCP. If this setting is \fBfalse\fR,
+\fBovs\-vswitchd\fR will not modify \fB/etc/resolv.conf\fR.
+.IP
+\fBovs\-vswitchd\fR will only modify \fBresolv.conf\fR if the DHCP response
+that it receives specifies one or more DNS servers.
+.RE
+.
+.TP
+\fBssl:\fIhost\fR[\fB:\fIport\fR]
+The specified SSL \fIport\fR (default: 6633) on the given remote
+\fIhost\fR. SSL must be configured when this form is used (see \fBSSL
+Configuration\fR, below).
+.
+.TP
+\fBtcp:\fIhost\fR[\fB:\fIport\fR]
+The specified TCP \fIport\fR (default: 6633) on the given remote
+\fIhost\fR.
+.
+.TP
+\fBunix:\fIfile\fR
+The Unix domain server socket named \fIfile\fR.
+.PP
+The datapath ID used by the bridge to identify itself to the remote
+controller may be specified as \fBbridge.\fIname\fB.datapath-id\fR,
+in the form of exactly 12 hexadecimal digits. If the datapath ID
+is not specified, then it defaults to the bridge's MAC address (see
+\fBBridge Configuration\fR, above, for information on how the bridge's
+MAC address is chosen).
+.ST "Local Port Network Configuration"
+When an external controller is configured, but controller discovery is
+not in use, the following additional settings are honored:
+.TP
+\fBbridge.\fIname\fB.controller.in-band=\fBtrue\fR|\fBfalse\fR
+By default, or if this is set to \fBtrue\fR, \fBovs\-vswitchd\fR connects
+to the controller in-band. If this is set to \fBfalse\fR,
+\fBovs\-vswitchd\fR connects to the controller out-of-band. Refer to
+\fBsecchan\fR(8) for a description of in-band and out-of-band control.
+.IP "\fBbridge.\fIname\fB.controller.ip=\fIip\fR"
+If specified, the IP address to configure on the bridge's local port.
+.IP "\fBbridge.\fIname\fB.controller.netmask=\fInetmask\fR"
+When an IP is specified, the corresponding netmask. The default is
+255.255.255.0 for a Class C IP address, 255.255.0.0 for Class B, and
+255.0.0.0 for Class A.
+.IP "\fBbridge.\fIname\fB.controller.gateway=\fIip\fR"
+When an IP is specified, the corresponding IP gateway. There is no
+default gateway.
+.ST "Controller Failure Settings"
+The following additional settings take effect when any remote
+controller is configured:
+.IP "\fBbridge.\fIname\fB.controller.inactivity-probe=\fIsecs\fR"
+This optional setting may be set to \fIsecs\fR, a number of seconds.
+The minimum value of \fIsecs\fR is 5 seconds. The default is taken
+from \fBmgmt.inactivity-probe\fR (see above).
+.IP
+When the virtual switch is connected to the controller, it waits for a
+message to be received from the controller for \fIsecs\fR seconds
+before it sends an inactivity probe to the controller. After sending
+the inactivity probe, if no response is received for an additional
+\fIsecs\fR seconds, the secure channel assumes that the connection has
+been broken and attempts to reconnect.
+.IP
+Changing the inactivity probe interval also changes the interval
+before entering standalone mode (see below).
+.IP "\fBbridge.\fIname\fB.controller.fail-mode=\fBstandalone\fR|\fBsecure\fR"
+.IQ "\fBmgmt.fail-mode=standalone\fR|\fBsecure\fR"
+When a controller is configured, it is, ordinarily, responsible for
+setting up all flows on the virtual switch. Thus, if the connection to
+the controller fails, no new network connections can be set up. If
+the connection to the controller stays down long enough, no packets
+can pass through the switch at all.
+.IP
+The first of these that is set takes effect.
+If the value is \fBstandalone\fR, \fBovs\-vswitchd\fR will take over
+responsibility for setting up
+flows when no message has been received from the controller for three
+times the inactivity probe interval (see above). In this mode,
+\fBovs\-vswitchd\fR causes the datapath to act like an ordinary
+MAC-learning switch. \fBovs\-vswitchd\fR will continue to retry connecting
+to the controller in the background and, when the connection succeeds,
+it discontinues its standalone behavior.
+.IP
+If this option is set to \fBsecure\fR, or if neither of these settings
+is set, \fBovs\-vswitchd\fR will not set up flows on its own when the
+controller connection fails.
+.IP "\fBbridge.\fIname\fB.controller.max-backoff=\fIsecs\fR"
+Sets the maximum time between attempts to connect to the controller to
+\fIsecs\fR, which must be at least 1. The actual interval between
+connection attempts starts at 1 second and doubles on each failing
+attempt until it reaches the maximum. The default maximum backoff
+time is taken from \fBmgmt.max-backoff\fR.
+.ST "Controller Rate-Limiting"
+These settings configure how the virtual switch applies a ``token
+bucket'' to limit the rate at which packets in unknown flows are
+forwarded to the OpenFlow controller for flow-setup processing. This
+feature prevents a single bridge from overwhelming a controller.
+.IP "\fBbridge.\fIname\fB.controller.rate-limit=\fIrate\fR"
+.IQ "\fBmgmt.rate-limit=\fIrate\fR"
+Limits the maximum rate at which packets will be forwarded to the
+OpenFlow controller to \fIrate\fR packets per second. A rate specified
+explicitly for \fIname\fR overrides a value configured using the
+\fBmgmt.rate-limit\fR key.
+.IP
+If neither one of these settings is set, then the bridge does not
+limit the rate at which packets are forwarded to the controller.
+.IP "\fBbridge.\fIname\fB.controller.burst-limit=\fIburst\fR"
+.IQ "\fBmgmt.burst-limit=\fIburst\fR"
+Sets the maximum number of unused packet credits that the bridge will
+allow to accumulate during the time in which no packets are being
+forwarded to the OpenFlow controller to \fIburst\fR (measured in
+packets). The default \fIburst\fR is one-quarter of the \fIrate\fR
+specified in the rate-limit setting.
+.IP
+A burst specified explicitly for \fIname\fR overrides a value configured
+using the \fBmgmt.burst-limit\fR key. This option takes effect only
+when a rate-limit is specified.
+.ST "Remote Command Execution Settings"
+These settings configure the commands that remote OpenFlow connections
+are allowed to invoke using (e.g.) \fBovs\-ofctl execute\fR. To be
+permitted, a command name must be whitelisted and must not be
+blacklisted. When the whitelist and blacklist permit a command name,
+\fBovs\-vswitchd\fR looks for a program with the same name as the command
+in the commands directory (see below). Other directories are not
+searched.
+.IP "\fBbridge.\fIname\fB.controller.commands.acl=\fIglob\fR"
+Whitelists commands whose names match shell glob pattern \fIglob\fR,
+allowing those commands to be invoked by the remote controller.
+.IP
+By default, no commands are whitelisted, so this setting is mandatory
+if any remote command execution is to be allowed.
+.IP "\fBbridge.\fIname\fB.controller.commands.acl=\fB!\fR\fIglob\fR"
+Blacklists commands whose names match shell glob pattern \fIglob\fR,
+prohibiting those commands from being invoked by the remote
+controller. Command names that include characters other than upper-
+and lower-case English letters, digits, and the underscore and hyphen
+characters are blacklisted unconditionally.
+.IP "\fBbridge.\fIname\fB.controller.commands.dir=\fIdirectory\fR"
+Sets the directory searched for remote command execution to
+\fIdirectory\fR. The default directory is
+\fB@pkgdatadir@/commands\fR.
+.SS "SSL Configuration"
+When \fBovs\-vswitchd\fR is configured to connect over SSL for management or
+for controller connectivity, the following settings are required:
+.TP
+\fBssl.private-key=\fIprivkey.pem\fR
+Specifies a PEM file containing the private key used as the virtual
+switch's identity for SSL connections to the controller.
+.TP
+\fBssl.certificate=\fIcert.pem\fR
+Specifies a PEM file containing a certificate, signed by the
+certificate authority (CA) used by the controller and manager, that
+certifies the virtual switch's private key, identifying a trustworthy
+switch.
+.TP
+\fBssl.ca-cert=\fIcacert.pem\fR
+Specifies a PEM file containing the CA certificate used to verify that
+the virtual switch is connected to a trustworthy controller.
+.PP
+These files are read only once, at \fBovs\-vswitchd\fR startup time. If
+their contents change, \fBovs\-vswitchd\fR must be killed and restarted.
+.PP
+These SSL settings apply to all SSL connections made by the virtual
+switch.
+.ST "CA Certificate Bootstrap"
+Ordinarily, all of the files named in the SSL configuration must exist
+when \fBovs\-vswitchd\fR starts. However, if \fBssl.bootstrap-ca-cert\fR
+is set to \fBtrue\fR, then \fBovs\-vswitchd\fR will attempt to obtain the
+CA certificate from the controller on its first SSL connection and
+save it to the named PEM file. If it is successful, it will
+immediately drop the connection and reconnect, and from then on all
+SSL connections must be authenticated by a certificate signed by the
+CA certificate thus obtained.
+.PP
+\fBThis option exposes the SSL connection to a man-in-the-middle
+attack while obtaining the initial CA certificate\fR, but it may be useful
+for bootstrapping.
+.PP
+This option is only useful if the controller sends its CA certificate
+as part of the SSL certificate chain. The SSL protocol does not
+require the controller to send the CA certificate, but
+\fBcontroller\fR(8) can be configured to do so with the
+\fB--peer-ca-cert\fR option.
+.SS "OpenFlow Management Connections"
+By default, each bridge \fIname\fR listens for OpenFlow management
+connections on a Unix domain socket named
+\fB@RUNDIR@/\fIname\fB.mgmt\fR. This socket can be used to perform
+local OpenFlow monitoring and administration, e.g., \fBovs\-ofctl dump-flows
+unix:@RUNDIR@/\fIname\fB.mgmt\fR to display the flows currently set up
+in bridge \fIname\fR.
+.PP
+If \fBbridge.\fIname\fB.openflow.listeners\fR is set to one or more
+values, \fBovs\-vswitchd\fR instead listens on the specified connection
+methods. Acceptable connection methods include:
+.RS
+.IP "\fBpunix:\fIfile\fR"
+Listens for connections on the Unix domain server socket named \fIfile\fR.
+.IP "\fBpssl:\fR[\fIport\fR]"
+Listens for SSL connections on \fIport\fR (default: 6633). SSL must
+be configured when this form is used (see \fBSSL Configuration\fR,
+above).
+.IP "\fBptcp:\fR[\fIport\fR]"
+Listens for TCP connections on \fIport\fR (default: 6633).
+.RE
+.PP
+To entirely disable listening for management connections, set
+\fBbridge.\fIname\fB.openflow.listeners\fR to the single value
+\fBnone\fR.
+.SS "OpenFlow Controller Connection Snooping"
+By default, each bridge \fIname\fR listens for OpenFlow controller
+connection snooping connections on a Unix domain socket named
+\fB@RUNDIR@/\fIname\fB.snoop\fR. A client that connects to this
+socket, e.g., \fBovs\-ofctl monitor unix:@RUNDIR@/\fIname\fB.snoop\fR, will
+receive a copy of every OpenFlow message sent by the switch to the
+controller, or vice versa, on the primary OpenFlow controller
+connection.
+.PP
+If \fBbridge.\fIname\fB.openflow.snoops\fR is set to one or more
+values, \fBovs\-vswitchd\fR instead listens on the specified connection
+methods. The acceptable connection methods are the same as for
+OpenFlow management connections (see above).
+.PP
+To entirely disable controller connection snooping, set
+\fBbridge.\fIname\fB.openflow.snoops\fR to the single value
+\fBnone\fR.
+.SH "SEE ALSO"
+.BR ovs\-brcompatd (8),
+.BR ovs\-cfg\-mod (8),
+.BR ovs\-vswitchd (8)
diff --git a/vswitchd/ovs-vswitchd.h b/vswitchd/ovs-vswitchd.h
new file mode 100644
index 00000000..f292d682
--- /dev/null
+++ b/vswitchd/ovs-vswitchd.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#ifndef VSWITCHD_H
+#define VSWITCHD_H 1
+
+void reconfigure(void);
+
+#endif /* ovs-vswitchd.h */
diff --git a/vswitchd/port.c b/vswitchd/port.c
new file mode 100644
index 00000000..f6348f35
--- /dev/null
+++ b/vswitchd/port.c
@@ -0,0 +1,68 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <config.h>
+
+#include "bridge.h"
+#include "cfg.h"
+#include "netdev.h"
+#include "ovs-vswitchd.h"
+#include "port.h"
+#include "svec.h"
+
+#define THIS_MODULE VLM_port
+#include "vlog.h"
+
+/* Reads the "port.<port_name>.ingress.policing-rate" and
+ * "port.<port_name>.ingress.policing-burst" configuration keys and hands both
+ * values to netdev_nodev_set_policing() for the device named 'port_name'.
+ *
+ * Returns whatever netdev_nodev_set_policing() returns. */
+static int
+set_ingress_policing(const char *port_name)
+{
+    int rate_kbits, burst_kbits;
+
+    rate_kbits = cfg_get_int(0, "port.%s.ingress.policing-rate", port_name);
+    burst_kbits = cfg_get_int(0, "port.%s.ingress.policing-burst", port_name);
+    return netdev_nodev_set_policing(port_name, rate_kbits, burst_kbits);
+}
+
+void
+port_init(void)
+{
+ port_reconfigure(); /* Initialization is just a first reconfiguration pass. */
+}
+
+/* Re-applies the configured ingress policing settings to every interface
+ * reported by bridge_get_ifaces().
+ *
+ * A failure on one interface does not stop the pass: the return value of
+ * set_ingress_policing() is ignored, as it always has been. */
+void
+port_reconfigure(void)
+{
+    struct svec ports;
+    int i;
+
+    svec_init(&ports);
+    bridge_get_ifaces(&ports);
+    for (i = 0; i < ports.n; i++) {
+        set_ingress_policing(ports.names[i]);
+    }
+
+    /* 'ports' is initialized here and never handed to anyone else, so it has
+     * to be released here as well; otherwise every reconfiguration leaks the
+     * collected interface names. */
+    svec_destroy(&ports);
+}
diff --git a/vswitchd/port.h b/vswitchd/port.h
new file mode 100644
index 00000000..55c2d7bc
--- /dev/null
+++ b/vswitchd/port.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#ifndef VSWITCHD_PORT_H
+#define VSWITCHD_PORT_H 1
+
+void port_init(void);
+void port_reconfigure(void);
+
+#endif /* port.h */
diff --git a/vswitchd/proc-net-compat.c b/vswitchd/proc-net-compat.c
new file mode 100644
index 00000000..3f5cf44a
--- /dev/null
+++ b/vswitchd/proc-net-compat.c
@@ -0,0 +1,344 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <config.h>
+#include "proc-net-compat.h"
+#include <assert.h>
+#include <dirent.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <string.h>
+#include "dynamic-string.h"
+#include "hash.h"
+#include "netlink-protocol.h"
+#include "netlink.h"
+#include "ofpbuf.h"
+#include "openvswitch/brcompat-netlink.h"
+#include "hmap.h"
+#include "shash.h"
+#include "svec.h"
+
+#define THIS_MODULE VLM_proc_net_compat
+#include "vlog.h"
+
+/* Netlink socket to bridge compatibility kernel module. */
+static struct nl_sock *brc_sock;
+
+/* The Generic Netlink family number used for bridge compatibility. */
+static int brc_family = 0;
+
+/* Rate limiting for log messages. */
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+
+static void flush_dir(const char *dir);
+static int set_proc_file(const char *dir, const char *file, const char *data);
+
+/* Sets up the /proc/net compatibility layer: looks up the bridge
+ * compatibility Generic Netlink family, opens a Generic Netlink socket, and
+ * then requests removal of every entry currently found in /proc/net/vlan and
+ * /proc/net/bonding.  Does nothing if the socket is already open.
+ *
+ * Returns 0 if successful (or already initialized), otherwise a positive
+ * errno value. */
+int
+proc_net_compat_init(void)
+{
+    int error;
+
+    if (brc_sock) {
+        return 0;
+    }
+
+    error = nl_lookup_genl_family(BRC_GENL_FAMILY_NAME, &brc_family);
+    if (!error) {
+        error = nl_sock_create(NETLINK_GENERIC, 0, 0, 0, &brc_sock);
+    }
+    if (error) {
+        return error;
+    }
+
+    flush_dir("/proc/net/vlan");
+    flush_dir("/proc/net/bonding");
+    return 0;
+}
+
+/* Sends a BRC_GENL_C_SET_PROC request over 'brc_sock' naming the directory
+ * 'dir' and the entry 'file'.  If 'data' is nonnull it is attached to the
+ * request as the entry's contents; callers pass a null 'data' to have the
+ * entry removed.
+ *
+ * Returns 0 if the transaction succeeds.  Otherwise logs a rate-limited
+ * warning and returns the error reported by nl_sock_transact(). */
+static int
+set_proc_file(const char *dir, const char *file, const char *data)
+{
+    const char *action = data ? "update" : "remove";
+    struct ofpbuf *reply;
+    struct ofpbuf msg;
+    int error;
+
+    ofpbuf_init(&msg, 0);
+    nl_msg_put_genlmsghdr(&msg, brc_sock, 1024, brc_family, NLM_F_REQUEST,
+                          BRC_GENL_C_SET_PROC, 1);
+    nl_msg_put_string(&msg, BRC_GENL_A_PROC_DIR, dir);
+    nl_msg_put_string(&msg, BRC_GENL_A_PROC_NAME, file);
+    if (data) {
+        nl_msg_put_string(&msg, BRC_GENL_A_PROC_DATA, data);
+    }
+
+    error = nl_sock_transact(brc_sock, &msg, &reply);
+    ofpbuf_uninit(&msg);
+    ofpbuf_delete(reply);
+    if (!error) {
+        return 0;
+    }
+
+    VLOG_WARN_RL(&rl, "failed to %s /proc/%s/%s (%s)",
+                 action, dir, file, strerror(error));
+    return error;
+}
+
+/* Requests removal of every entry found in 'dir', which must start with
+ * "/proc/".  The "." and ".." entries are skipped.
+ *
+ * A 'dir' that does not exist is silently ignored; any other failure to open
+ * it is logged (rate-limited) and otherwise ignored. */
+static void
+flush_dir(const char *dir)
+{
+    const char *subdir;
+    struct dirent *entry;
+    DIR *dirp;
+
+    /* set_proc_file() is given the path with the leading "/proc/" removed. */
+    assert(!memcmp(dir, "/proc/", 6));
+    subdir = dir + 6;
+
+    dirp = opendir(dir);
+    if (!dirp) {
+        if (errno != ENOENT) {
+            VLOG_WARN_RL(&rl, "%s: open failed (%s)", dir, strerror(errno));
+        }
+        return;
+    }
+
+    for (;;) {
+        entry = readdir(dirp);
+        if (!entry) {
+            break;
+        }
+        if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) {
+            continue;
+        }
+        set_proc_file(subdir, entry->d_name, NULL);
+    }
+    closedir(dirp);
+}
+
+/* If 'bond' is nonnull, creates a file in /proc/net/bonding for a bond with
+ * the given 'name' and the details in 'bond'. If 'bond' is null, deletes
+ * the /proc/net/bonding file with the given 'name'.
+ *
+ * The generated text starts with a header in which only the MII status and
+ * the up and down delays come from 'bond'; the bonding mode, primary slave,
+ * active slave and polling interval are fixed text. The header is followed
+ * by one stanza per element of 'bond->slaves' giving that slave's name, MII
+ * status and MAC address, with a link failure count that is always written
+ * as 0.
+ *
+ * This function has no effect unless proc_net_compat_init() has been
+ * called. */
+void
+proc_net_compat_update_bond(const char *name, const struct compat_bond *bond)
+{
+ struct ds ds;
+ int i;
+
+ if (!brc_sock) {
+ return;
+ }
+
+ if (!bond) {
+ set_proc_file("net/bonding", name, NULL);
+ return;
+ }
+
+ ds_init(&ds);
+ ds_put_format(
+ &ds,
+ "Ethernet Channel Bonding Driver: ovs-vswitchd "
+ VERSION BUILDNR" ("__DATE__" "__TIME__")\n"
+ "Bonding Mode: source load balancing\n"
+ "Primary Slave: None\n"
+ "Currently Active Slave: None\n"
+ "MII Status: %s\n"
+ "MII Polling Interval (ms): 100\n"
+ "Up Delay (ms): %d\n"
+ "Down Delay (ms): %d\n"
+ "\n"
+ "Source load balancing info:\n",
+ bond->up ? "up" : "down", bond->updelay, bond->downdelay);
+ for (i = 0; i < bond->n_slaves; i++) {
+ const struct compat_bond_slave *slave = &bond->slaves[i];
+ ds_put_format(
+ &ds,
+ "\n"
+ "Slave Interface: %s\n"
+ "MII Status: %s\n"
+ "Link Failure Count: 0\n"
+ "Permanent HW addr: "ETH_ADDR_FMT"\n",
+ slave->name, slave->up ? "up" : "down",
+ ETH_ADDR_ARGS(slave->mac));
+ }
+ set_proc_file("net/bonding", name, ds_cstr(&ds));
+ ds_destroy(&ds);
+}
+
+/* /proc/net/vlan compatibility.
+ *
+ * This is much more complex than I expected it to be. */
+
+struct compat_vlan {
+ /* Hash key: the (trunk_dev, vid) pair, hashed with hash_vlan().
+  *
+  * NOTE(review): the lookup loop in proc_net_compat_update_vlan() tests its
+  * iteration variable against NULL after HMAP_FOR_EACH_WITH_HASH, which
+  * presumably relies on 'trunk_node' remaining the first member -- confirm
+  * against hmap.h before reordering these fields. */
+ struct hmap_node trunk_node; /* Hash map node in 'vlans_by_trunk'. */
+ char *trunk_dev; /* Name of trunk network device (owned copy). */
+ int vid; /* VLAN number. */
+
+ /* Auxiliary data. */
+ char *vlan_dev; /* sprintf("%s.%d", trunk_dev, vid); */
+ struct svec tagged_devs; /* Name of tagged network device(s), kept sorted. */
+};
+
+/* Current set of VLAN devices, indexed two different ways. */
+static struct hmap vlans_by_trunk = HMAP_INITIALIZER(&vlans_by_trunk);
+static struct shash vlans_by_tagged = SHASH_INITIALIZER(&vlans_by_tagged);
+
+static bool remove_tagged_dev(struct shash_node *, const char *tagged_dev);
+static void update_vlan_config(void);
+static void set_vlan_proc_file(const struct compat_vlan *);
+static uint32_t hash_vlan(const char *trunk_dev, uint32_t vid);
+
+/* Updates the /proc/net/vlan compatibility layer's idea of what trunk device
+ * and VLAN the given 'tagged_dev' is associated with. If 'tagged_dev' has an
+ * implicit VLAN tag, then 'trunk_dev' should be the name of a network device
+ * on the same bridge that trunks that VLAN, and 'vid' should be the VLAN tag
+ * number. If 'tagged_dev' does not have an implicit VLAN tag, then
+ * 'trunk_dev' should be NULL and 'vid' should be -1.
+ *
+ * As implemented, any 'vid' <= 0 or a null 'trunk_dev' is treated as "no
+ * implicit VLAN tag": 'tagged_dev' is detached from the compat_vlan it was
+ * attached to, if any, and /proc/net/vlan/config is rewritten only if that
+ * destroyed the compat_vlan. Otherwise, unless 'tagged_dev' is already
+ * attached to the compat_vlan for ('trunk_dev', 'vid'), it is moved to that
+ * compat_vlan, which is created together with its /proc/net/vlan file if it
+ * does not exist yet, and /proc/net/vlan/config is rewritten.
+ *
+ * This function has no effect unless proc_net_compat_init() has been
+ * called. */
+void
+proc_net_compat_update_vlan(const char *tagged_dev, const char *trunk_dev,
+ int vid)
+{
+ struct compat_vlan *vlan;
+ struct shash_node *node;
+
+ if (!brc_sock) {
+ return;
+ }
+
+ /* Find the compat_vlan that we currently have for 'tagged_dev' (if
+ * any). */
+ node = shash_find(&vlans_by_tagged, tagged_dev);
+ vlan = node ? node->data : NULL;
+ if (vid <= 0 || !trunk_dev) {
+ if (vlan) {
+ if (remove_tagged_dev(node, tagged_dev)) {
+ update_vlan_config();
+ }
+ }
+ } else {
+ if (vlan) {
+ if (!strcmp(trunk_dev, vlan->trunk_dev) && vid == vlan->vid) {
+ /* No change. */
+ return;
+ } else {
+ /* 'tagged_dev' is attached to the wrong compat_vlan. Start
+ * by removing it from that one. The result is ignored here
+ * because the config file is rewritten unconditionally at
+ * the end of this branch. */
+ remove_tagged_dev(node, tagged_dev);
+ node = NULL;
+ vlan = NULL;
+ }
+ }
+
+ /* 'tagged_dev' is not attached to any compat_vlan. Find the
+ * compat_vlan corresponding to (trunk_dev,vid) to attach it to, or
+ * create a new compat_vlan if none exists for (trunk_dev,vid).
+ *
+ * NOTE(review): the '!vlan' test after the loop assumes that the
+ * loop leaves 'vlan' null when no entry matches -- confirm against
+ * the definition of HMAP_FOR_EACH_WITH_HASH. */
+ HMAP_FOR_EACH_WITH_HASH (vlan, struct compat_vlan, trunk_node,
+ hash_vlan(trunk_dev, vid),
+ &vlans_by_trunk) {
+ if (!strcmp(trunk_dev, vlan->trunk_dev) && vid == vlan->vid) {
+ break;
+ }
+ }
+ if (!vlan) {
+ /* Create a new compat_vlan for (trunk_dev,vid). */
+ vlan = xcalloc(1, sizeof *vlan);
+ vlan->trunk_dev = xstrdup(trunk_dev);
+ vlan->vid = vid;
+ vlan->vlan_dev = xasprintf("%s.%d", trunk_dev, vid);
+ svec_init(&vlan->tagged_devs);
+ hmap_insert(&vlans_by_trunk, &vlan->trunk_node,
+ hash_vlan(trunk_dev, vid));
+ set_vlan_proc_file(vlan);
+ }
+
+ /* Attach 'tagged_dev' to 'vlan'. The svec is re-sorted after every
+ * addition. */
+ svec_add(&vlan->tagged_devs, tagged_dev);
+ shash_add(&vlans_by_tagged, tagged_dev, vlan);
+ svec_sort(&vlan->tagged_devs);
+ update_vlan_config();
+ }
+}
+
+/* Detaches 'tagged_dev' from the compat_vlan that 'node' (an entry of
+ * 'vlans_by_tagged') refers to, and deletes 'node' from 'vlans_by_tagged'.
+ *
+ * If the compat_vlan is left with no tagged devices, its /proc/net/vlan file
+ * is removed, it is unlinked from 'vlans_by_trunk' and freed, and true is
+ * returned.  Returns false if the compat_vlan is still in use. */
+static bool
+remove_tagged_dev(struct shash_node *node, const char *tagged_dev)
+{
+    struct compat_vlan *cv = node->data;
+
+    svec_del(&cv->tagged_devs, tagged_dev);
+    shash_delete(&vlans_by_tagged, node);
+    if (cv->tagged_devs.n) {
+        return false;
+    }
+
+    set_proc_file("net/vlan", cv->vlan_dev, NULL);
+
+    hmap_remove(&vlans_by_trunk, &cv->trunk_node);
+    svec_destroy(&cv->tagged_devs);
+    free(cv->trunk_dev);
+    free(cv->vlan_dev);
+    free(cv);
+    return true;
+}
+
+/* Computes the hash value that identifies the pair (trunk_dev,vid) in
+ * 'vlans_by_trunk': the string hash of 'trunk_dev' is fed to hash_int()
+ * together with 'vid'. */
+static uint32_t
+hash_vlan(const char *trunk_dev, uint32_t vid)
+{
+    uint32_t name_hash = hash_string(trunk_dev, 0);
+
+    return hash_int(vid, name_hash);
+}
+
+/* Update /proc/net/vlan/<vlan_dev> for 'vlan'.
+ *
+ * Only the VLAN device name, the VLAN ID and the trunk device name vary;
+ * every counter is written as 0 and the flag and priority-mapping fields are
+ * fixed text.
+ *
+ * NOTE(review): the "EGRESSS" spelling and the capitalization of the last
+ * line look deliberate, presumably mirroring the kernel's own /proc/net/vlan
+ * output -- confirm before "correcting" them. */
+static void
+set_vlan_proc_file(const struct compat_vlan *vlan)
+{
+ struct ds ds;
+
+ ds_init(&ds);
+ ds_put_format(
+ &ds,
+ "%s VID: %d\t REORDER_HDR: 1 dev->priv_flags: 81\n"
+ " total frames received 0\n"
+ " total bytes received 0\n"
+ " Broadcast/Multicast Rcvd 0\n"
+ "\n"
+ " total frames transmitted 0\n"
+ " total bytes transmitted 0\n"
+ " total headroom inc 0\n"
+ " total encap on xmit 0\n"
+ "Device: %s\n"
+ "INGRESS priority mappings: 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0\n"
+ "EGRESSS priority Mappings: \n",
+ vlan->vlan_dev, vlan->vid, vlan->trunk_dev);
+ set_proc_file("net/vlan", vlan->vlan_dev, ds_cstr(&ds));
+ ds_destroy(&ds);
+}
+
+/* Rewrites /proc/net/vlan/config from scratch: a fixed two-line header
+ * followed by one row, giving the VLAN device name, VLAN ID and trunk device
+ * name, for each compat_vlan currently in 'vlans_by_trunk'. */
+static void
+update_vlan_config(void)
+{
+    struct compat_vlan *cv;
+    struct ds config;
+
+    ds_init(&config);
+    ds_put_cstr(&config, "VLAN Dev name | VLAN ID\n"
+                "Name-Type: VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD\n");
+    HMAP_FOR_EACH (cv, struct compat_vlan, trunk_node, &vlans_by_trunk) {
+        ds_put_format(&config, "%-15s| %d | %s\n",
+                      cv->vlan_dev, cv->vid, cv->trunk_dev);
+    }
+    set_proc_file("net/vlan", "config", ds_cstr(&config));
+    ds_destroy(&config);
+}
diff --git a/vswitchd/proc-net-compat.h b/vswitchd/proc-net-compat.h
new file mode 100644
index 00000000..ce97176b
--- /dev/null
+++ b/vswitchd/proc-net-compat.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#ifndef VSWITCHD_PROC_NET_COMPAT_H
+#define VSWITCHD_PROC_NET_COMPAT_H 1
+
+#include "packets.h"
+
+struct compat_bond {
+ bool up; /* Reported as the bond's "MII Status". */
+ int updelay; /* Reported as "Up Delay (ms)". */
+ int downdelay; /* Reported as "Down Delay (ms)". */
+ int n_slaves; /* Number of elements in 'slaves'. */
+ struct compat_bond_slave *slaves; /* Array of 'n_slaves' slaves. */
+};
+
+struct compat_bond_slave {
+ const char *name; /* Reported as "Slave Interface". */
+ bool up; /* Reported as the slave's "MII Status". */
+ uint8_t mac[ETH_ADDR_LEN]; /* Reported as "Permanent HW addr". */
+};
+
+int proc_net_compat_init(void);
+void proc_net_compat_update_bond(const char *name, const struct compat_bond *);
+/* 'tagged_dev' is the device that carries an implicit VLAN tag, 'trunk_dev'
+ * is the device that trunks that VLAN, and 'vid' is the VLAN number.  The
+ * parameter names match the definition in proc-net-compat.c. */
+void proc_net_compat_update_vlan(const char *tagged_dev,
+                                 const char *trunk_dev, int vid);
+
+#endif /* vswitchd/proc-net-compat.h */
diff --git a/vswitchd/xenserver.c b/vswitchd/xenserver.c
new file mode 100644
index 00000000..7a8d255f
--- /dev/null
+++ b/vswitchd/xenserver.c
@@ -0,0 +1,90 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#include <config.h>
+#include "xenserver.h"
+#include <ctype.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "dynamic-string.h"
+#include "process.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_xenserver
+
+/* Searches /etc/xensource-inventory for the host's installation UUID.
+ *
+ * A line qualifies only if it consists of exactly INSTALLATION_UUID='
+ * followed by 36 characters, a closing single quote and a newline.  On a
+ * match, returns a copy of those 36 characters made with xmemdup0().
+ * Returns NULL if the file cannot be opened or contains no such line.  The
+ * outcome is logged in every case. */
+static char *
+read_host_uuid(void)
+{
+    static const char filename[] = "/etc/xensource-inventory";
+    static const char leader[] = "INSTALLATION_UUID='";
+    static const char trailer[] = "'\n";
+    const int leader_len = strlen(leader);
+    const int trailer_len = strlen(trailer);
+    const int uuid_len = 36;
+    char line[128];
+    FILE *file;
+
+    file = fopen(filename, "r");
+    if (!file) {
+        if (errno != ENOENT) {
+            VLOG_INFO("%s: open: %s", filename, strerror(errno));
+        } else {
+            VLOG_INFO("not running on a XenServer");
+        }
+        return NULL;
+    }
+
+    while (fgets(line, sizeof line, file)) {
+        char *host_uuid;
+
+        /* Skip anything that is not exactly leader + UUID + trailer. */
+        if (strlen(line) != leader_len + uuid_len + trailer_len
+            || memcmp(line, leader, leader_len)
+            || memcmp(line + leader_len + uuid_len, trailer, trailer_len)) {
+            continue;
+        }
+
+        host_uuid = xmemdup0(line + leader_len, uuid_len);
+        VLOG_INFO("running on XenServer, host-uuid %s", host_uuid);
+        fclose(file);
+        return host_uuid;
+    }
+
+    fclose(file);
+    VLOG_ERR("%s: INSTALLATION_UUID not found", filename);
+    return NULL;
+}
+
+/* Returns the host UUID found by read_host_uuid(), or NULL if it found none.
+ *
+ * read_host_uuid() is called on the first call only; its result, including a
+ * NULL result, is kept in static storage and returned by every later call. */
+const char *
+xenserver_get_host_uuid(void)
+{
+    static bool inited;
+    static char *host_uuid;
+
+    if (inited) {
+        return host_uuid;
+    }
+
+    host_uuid = read_host_uuid();
+    inited = true;
+    return host_uuid;
+}
+
diff --git a/vswitchd/xenserver.h b/vswitchd/xenserver.h
new file mode 100644
index 00000000..c69b133a
--- /dev/null
+++ b/vswitchd/xenserver.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#ifndef VSWITCHD_XENSERVER_H
+#define VSWITCHD_XENSERVER_H 1
+
+const char *xenserver_get_host_uuid(void);
+
+#endif /* xenserver.h */