aboutsummaryrefslogtreecommitdiff
path: root/lib/reconnect.c
diff options
context:
space:
mode:
author Ben Pfaff <blp@nicira.com> 2009-11-10 15:30:49 -0800
committer Ben Pfaff <blp@nicira.com> 2009-11-12 12:56:21 -0800
commit 3ed497fc10033c9857140270d60ef6aa2d7c0c08 (patch)
tree 13fc8d25e8023b1a33c2b003e35c53e963b80423 /lib/reconnect.c
parent 8ecd53084ce42e2e868a9c24c57b2d9a7e59cfc2 (diff)
New "reconnect" library for managing network connection attempts.
This library implements the reconnection FSM used by the "rconn" library. Therefore, it makes sense to change rconn to use this, and I have a patch to do that, but I am not applying it at the moment to avoid changing unrelated code on the "db" branch.
Diffstat (limited to 'lib/reconnect.c')
-rw-r--r-- lib/reconnect.c 523
1 files changed, 523 insertions, 0 deletions
diff --git a/lib/reconnect.c b/lib/reconnect.c
new file mode 100644
index 00000000..fadeeb89
--- /dev/null
+++ b/lib/reconnect.c
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include "reconnect.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "poll-loop.h"
+
+#define THIS_MODULE VLM_reconnect
+#include "vlog.h"
+
+/* The states of the reconnect FSM.  Each state is assigned its own bit so
+ * that membership in a set of states can be tested with a single bitwise AND
+ * (see is_connected_state()).
+ *
+ * STATES is an X-macro: define STATE(ID, BIT), expand STATES, then undefine
+ * STATE again. */
+#define STATES                      \
+    STATE(VOID, 1 << 0)             \
+    STATE(BACKOFF, 1 << 1)          \
+    STATE(CONNECTING, 1 << 2)       \
+    STATE(ACTIVE, 1 << 3)           \
+    STATE(IDLE, 1 << 4)             \
+    STATE(RECONNECT, 1 << 5)
+enum state {
+#define STATE(ID, BIT) S_##ID = BIT,
+    STATES
+#undef STATE
+};
+
+/* Returns true if 'state' has the S_ACTIVE or S_IDLE bit set, that is, if it
+ * is one of the states in which the FSM considers itself connected. */
+static bool
+is_connected_state(enum state state)
+{
+    const int connected_states = S_ACTIVE | S_IDLE;
+
+    return (state & connected_states) ? true : false;
+}
+
+/* A reconnection state machine.  All times are caller-supplied 'now' values;
+ * all intervals are in milliseconds. */
+struct reconnect {
+    /* Configuration. */
+    char *name;                 /* Owned copy of the name used in logs. */
+    int min_backoff;            /* Minimum backoff between attempts. */
+    int max_backoff;            /* Maximum backoff between attempts. */
+    int probe_interval;         /* Keepalive interval, 0 to disable. */
+
+    /* State. */
+    enum state state;           /* Current FSM state. */
+    long long int state_entered; /* Time at which 'state' was entered. */
+    int backoff;                /* Current backoff interval. */
+    long long int last_received; /* Time of last reconnect_received(). */
+    long long int last_connected; /* Time of last reconnect_connected(). */
+
+    /* These values are simply for statistics reporting, not otherwise used
+     * directly by anything internal. */
+    long long int creation_time; /* Time passed to reconnect_create(). */
+    unsigned int n_attempted_connections;  /* Times S_CONNECTING was left. */
+    unsigned int n_successful_connections; /* ...of which ended in S_ACTIVE. */
+    unsigned int total_connected_duration; /* Sum over ended connections. */
+    unsigned int seqno;         /* Bumped on each connect or disconnect. */
+};
+
+/* Internal helpers, defined further down in this file. */
+static long long int reconnect_deadline__(const struct reconnect *fsm);
+static void reconnect_transition__(struct reconnect *fsm, long long int now,
+                                   enum state state);
+
+/* Returns the name of 'state' without its "S_" prefix (e.g. "BACKOFF"), or
+ * "***ERROR***" if 'state' is not one of the values listed in STATES. */
+static const char *
+reconnect_state_name__(enum state state)
+{
+    const char *name = "***ERROR***";
+
+    switch (state) {
+#define STATE(ID, BIT) case S_##ID: name = #ID; break;
+        STATES
+#undef STATE
+    }
+    return name;
+}
+
+/* Allocates and returns a new reconnect FSM with default settings: name
+ * "void", backoff between 1000 and 8000 ms, and a 5000 ms probe interval.
+ * 'now' is recorded as the creation time and as the initial value of every
+ * other timestamp.
+ *
+ * The new FSM starts out disabled (in S_VOID).  The caller will likely want
+ * to call reconnect_enable() and reconnect_set_name() on it, and must
+ * eventually release it with reconnect_destroy(). */
+struct reconnect *
+reconnect_create(long long int now)
+{
+    struct reconnect *new_fsm = xzalloc(sizeof *new_fsm);
+
+    /* Timestamps all start at the creation time. */
+    new_fsm->creation_time = now;
+    new_fsm->state_entered = now;
+    new_fsm->last_received = now;
+    new_fsm->last_connected = now;
+
+    /* Initial state: disabled, with no backoff pending. */
+    new_fsm->state = S_VOID;
+    new_fsm->backoff = 0;
+
+    /* Default configuration. */
+    new_fsm->name = xstrdup("void");
+    new_fsm->min_backoff = 1000;
+    new_fsm->max_backoff = 8000;
+    new_fsm->probe_interval = 5000;
+
+    return new_fsm;
+}
+
+/* Releases 'fsm' and the name string it owns.  A null 'fsm' is accepted and
+ * ignored. */
+void
+reconnect_destroy(struct reconnect *fsm)
+{
+    if (!fsm) {
+        return;
+    }
+
+    free(fsm->name);
+    free(fsm);
+}
+
+/* Returns the name currently configured for 'fsm'.  The string is owned by
+ * 'fsm'; reconnect_set_name() and reconnect_destroy() free it. */
+const char *
+reconnect_get_name(const struct reconnect *fsm)
+{
+    const char *current_name = fsm->name;
+
+    return current_name;
+}
+
+/* Sets 'fsm''s name to a copy of 'name'.  If 'name' is null, then "void" is
+ * used instead.
+ *
+ * The name set for 'fsm' is used in log messages.
+ *
+ * 'name' may point to (or into) 'fsm''s current name, e.g. the value returned
+ * by reconnect_get_name(): the copy is made before the old name is freed. */
+void
+reconnect_set_name(struct reconnect *fsm, const char *name)
+{
+    /* Duplicate first so that 'name' is never read after it has been freed
+     * when it aliases the old name. */
+    char *new_name = xstrdup(name ? name : "void");
+
+    free(fsm->name);
+    fsm->name = new_name;
+}
+
+/* Returns the smallest interval, in milliseconds, that 'fsm' backs off
+ * between consecutive connection attempts.  The default is 1000 ms. */
+int
+reconnect_get_min_backoff(const struct reconnect *fsm)
+{
+    int min_backoff_ms = fsm->min_backoff;
+
+    return min_backoff_ms;
+}
+
+/* Returns the largest interval, in milliseconds, that 'fsm' backs off
+ * between consecutive connection attempts.  The default is 8000 ms. */
+int
+reconnect_get_max_backoff(const struct reconnect *fsm)
+{
+    int max_backoff_ms = fsm->max_backoff;
+
+    return max_backoff_ms;
+}
+
+/* Returns 'fsm''s "probe interval" in milliseconds.
+ *
+ * A value of zero means that the connection keepalive feature is disabled.
+ * Otherwise, when the interval elapses while 'fsm' is connected without any
+ * call to reconnect_received(), reconnect_run() returns RECONNECT_PROBE; if
+ * the interval then elapses a second time, still without
+ * reconnect_received() being called, reconnect_run() returns
+ * RECONNECT_DISCONNECT. */
+int
+reconnect_get_probe_interval(const struct reconnect *fsm)
+{
+    int probe_interval_ms = fsm->probe_interval;
+
+    return probe_interval_ms;
+}
+
+/* Configures the backoff parameters for 'fsm'.  'min_backoff' is the minimum
+ * number of milliseconds, and 'max_backoff' is the maximum, between connection
+ * attempts.
+ *
+ * The arguments are normalized rather than rejected:
+ *
+ *   - 'min_backoff' is raised to 1000 if it is smaller than that.
+ *
+ *   - A 'max_backoff' of 0 selects the default of 8000; any other value is
+ *     raised to 1000 if it is smaller than that.
+ *
+ *   - If the resulting maximum is less than the resulting minimum, the
+ *     maximum is raised to the minimum.
+ *
+ * If 'fsm' is currently backing off for longer than the new maximum, the
+ * current backoff is shortened to the new maximum. */
+void
+reconnect_set_backoff(struct reconnect *fsm, int min_backoff, int max_backoff)
+{
+    fsm->min_backoff = MAX(min_backoff, 1000);
+    fsm->max_backoff = max_backoff ? MAX(max_backoff, 1000) : 8000;
+    if (fsm->min_backoff > fsm->max_backoff) {
+        fsm->max_backoff = fsm->min_backoff;
+    }
+
+    /* Clamp against the normalized maximum, not the raw argument: the raw
+     * value may be 0 (meaning "default") or below the enforced bounds, and
+     * using it would cut the pending backoff short or eliminate it. */
+    if (fsm->state == S_BACKOFF && fsm->backoff > fsm->max_backoff) {
+        fsm->backoff = fsm->max_backoff;
+    }
+}
+
+/* Sets 'fsm''s "probe interval" to 'probe_interval', in milliseconds.
+ *
+ * Zero disables the connection keepalive feature.  Any nonzero value is
+ * raised to at least 1000 ms before it is stored.
+ *
+ * With a nonzero interval, when the interval elapses while 'fsm' is connected
+ * without any call to reconnect_received(), reconnect_run() returns
+ * RECONNECT_PROBE; if the interval then elapses again, still without
+ * reconnect_received() being called, reconnect_run() returns
+ * RECONNECT_DISCONNECT. */
+void
+reconnect_set_probe_interval(struct reconnect *fsm, int probe_interval)
+{
+    if (probe_interval) {
+        fsm->probe_interval = MAX(1000, probe_interval);
+    } else {
+        fsm->probe_interval = 0;
+    }
+}
+
+/* Returns true if 'fsm' is enabled, that is, in any state other than S_VOID.
+ *
+ * reconnect_enable() is the explicit way to enable an FSM, but any function
+ * that reports a change in connection state (e.g. reconnect_disconnected()
+ * or reconnect_connecting()) also moves it out of S_VOID and thereby enables
+ * it. */
+bool
+reconnect_is_enabled(const struct reconnect *fsm)
+{
+    bool disabled = fsm->state == S_VOID;
+
+    return !disabled;
+}
+
+/* Enables 'fsm' if it is disabled (the default for newly created FSMs): it
+ * enters S_BACKOFF with a backoff of zero, so that the next call to
+ * reconnect_run() for 'fsm' will return RECONNECT_CONNECT.
+ *
+ * Has no effect on an FSM that is already enabled. */
+void
+reconnect_enable(struct reconnect *fsm, long long int now)
+{
+    if (fsm->state != S_VOID) {
+        return;
+    }
+
+    reconnect_transition__(fsm, now, S_BACKOFF);
+    fsm->backoff = 0;
+}
+
+/* Disables 'fsm' by moving it to S_VOID.  Until 'fsm' is enabled again,
+ * reconnect_run() will always return 0.
+ *
+ * Has no effect on an FSM that is already disabled. */
+void
+reconnect_disable(struct reconnect *fsm, long long int now)
+{
+    if (fsm->state == S_VOID) {
+        return;
+    }
+
+    reconnect_transition__(fsm, now, S_VOID);
+}
+
+/* If 'fsm' is currently connected or attempting to connect, moves it to
+ * S_RECONNECT, which makes reconnect_run() return RECONNECT_DISCONNECT the
+ * next time it is called.  That should cause the client to drop the
+ * connection (or attempt), back off, and then reconnect.
+ *
+ * In any other state this function has no effect. */
+void
+reconnect_force_reconnect(struct reconnect *fsm, long long int now)
+{
+    bool attempting = (fsm->state & S_CONNECTING) != 0;
+
+    if (attempting || is_connected_state(fsm->state)) {
+        reconnect_transition__(fsm, now, S_RECONNECT);
+    }
+}
+
+/* Tell 'fsm' that the connection dropped or that a connection attempt failed.
+ * 'error' specifies the reason: a positive value represents an errno value,
+ * EOF indicates that the connection was closed by the peer (e.g. read()
+ * returned 0), and 0 indicates no specific error.
+ *
+ * If 'fsm' is already backing off, this function has no effect.  Otherwise
+ * it logs what happened, chooses the next backoff interval, and enters
+ * S_BACKOFF, after which the FSM will reconnect:
+ *
+ *   - If 'fsm' was connected and the time between connecting and the last
+ *     received data is at least the current backoff, the backoff is reset
+ *     to the minimum.
+ *
+ *   - Otherwise the backoff grows: it is raised to the minimum if it is
+ *     below that, set to the maximum if it has reached half the maximum,
+ *     and doubled in all other cases. */
+void
+reconnect_disconnected(struct reconnect *fsm, long long int now, int error)
+{
+    bool was_connected;
+
+    if (fsm->state == S_BACKOFF) {
+        return;
+    }
+    was_connected = is_connected_state(fsm->state);
+
+    /* Report what happened. */
+    if (was_connected) {
+        if (error > 0) {
+            VLOG_WARN("%s: connection dropped (%s)",
+                      fsm->name, strerror(error));
+        } else if (error == EOF) {
+            VLOG_INFO("%s: connection closed by peer", fsm->name);
+        } else {
+            VLOG_INFO("%s: connection dropped", fsm->name);
+        }
+    } else {
+        if (error > 0) {
+            VLOG_WARN("%s: connection attempt failed (%s)",
+                      fsm->name, strerror(error));
+        } else {
+            VLOG_INFO("%s: connection attempt timed out", fsm->name);
+        }
+    }
+
+    /* Back off. */
+    if (was_connected
+        && fsm->last_received - fsm->last_connected >= fsm->backoff) {
+        /* The connection stayed useful for at least one backoff period, so
+         * start over from the shortest delay. */
+        fsm->backoff = fsm->min_backoff;
+    } else {
+        if (fsm->backoff < fsm->min_backoff) {
+            fsm->backoff = fsm->min_backoff;
+        } else if (fsm->backoff >= fsm->max_backoff / 2) {
+            fsm->backoff = fsm->max_backoff;
+        } else {
+            fsm->backoff *= 2;
+        }
+        /* No trailing newline, consistent with the other log messages in
+         * this file. */
+        VLOG_INFO("%s: waiting %.3g seconds before reconnect",
+                  fsm->name, fsm->backoff / 1000.0);
+    }
+    reconnect_transition__(fsm, now, S_BACKOFF);
+}
+
+/* Tell 'fsm' that a connection attempt is in progress.
+ *
+ * Unless 'fsm' is already in S_CONNECTING, this logs the attempt and enters
+ * S_CONNECTING, which starts a timer.  When that timer expires the attempt
+ * should be aborted, which reconnect_run() signals by returning
+ * RECONNECT_DISCONNECT. */
+void
+reconnect_connecting(struct reconnect *fsm, long long int now)
+{
+    if (fsm->state == S_CONNECTING) {
+        return;
+    }
+
+    VLOG_INFO("%s: connecting...", fsm->name);
+    reconnect_transition__(fsm, now, S_CONNECTING);
+}
+
+/* Tell 'fsm' that the connection was successful.
+ *
+ * If 'fsm' is already in a connected state this has no effect.  Otherwise
+ * 'fsm' passes through S_CONNECTING (if it was not already there) into
+ * S_ACTIVE and records 'now' as the time of connection.
+ *
+ * Entering S_ACTIVE starts the probe interval timer, which is reset by
+ * reconnect_received().  If the timer expires, a probe will be sent (by
+ * returning RECONNECT_PROBE from reconnect_run()).  If the timer expires
+ * again without being reset, the connection will be aborted (by returning
+ * RECONNECT_DISCONNECT from reconnect_run()). */
+void
+reconnect_connected(struct reconnect *fsm, long long int now)
+{
+    if (is_connected_state(fsm->state)) {
+        return;
+    }
+
+    /* Make sure the attempt is accounted for even if the client never
+     * reported it separately. */
+    reconnect_connecting(fsm, now);
+
+    VLOG_INFO("%s: connected", fsm->name);
+    reconnect_transition__(fsm, now, S_ACTIVE);
+    fsm->last_connected = now;
+}
+
+/* Tell 'fsm' that the connection attempt failed.
+ *
+ * The FSM will back off and attempt to reconnect. */
+void
+reconnect_connect_failed(struct reconnect *fsm, long long int now, int error)
+{
+ reconnect_connecting(fsm, now);
+ reconnect_disconnected(fsm, now, error);
+}
+
+/* Tell 'fsm' that some data was received at time 'now'.
+ *
+ * This puts 'fsm' into S_ACTIVE if it is in any other state and records
+ * 'now' as the time of last receipt, which resets the probe interval timer so
+ * that the connection is known not to be idle. */
+void
+reconnect_received(struct reconnect *fsm, long long int now)
+{
+    bool already_active = fsm->state == S_ACTIVE;
+
+    if (!already_active) {
+        reconnect_transition__(fsm, now, S_ACTIVE);
+    }
+    fsm->last_received = now;
+}
+
+/* Moves 'fsm' into 'state' at time 'now', logging the transition at debug
+ * level and updating the statistics:
+ *
+ *   - Leaving S_CONNECTING counts as one attempted connection, and as a
+ *     successful one too if the new state is S_ACTIVE.
+ *
+ *   - Crossing between a connected and an unconnected state increments
+ *     'seqno'; when the connection is being lost, its duration is also added
+ *     to the running total. */
+static void
+reconnect_transition__(struct reconnect *fsm, long long int now,
+                       enum state state)
+{
+    bool was_connected = is_connected_state(fsm->state);
+    bool now_connected = is_connected_state(state);
+
+    if (fsm->state == S_CONNECTING) {
+        fsm->n_attempted_connections++;
+        if (state == S_ACTIVE) {
+            fsm->n_successful_connections++;
+        }
+    }
+
+    if (was_connected != now_connected) {
+        if (was_connected) {
+            fsm->total_connected_duration += now - fsm->last_connected;
+        }
+        fsm->seqno++;
+    }
+
+    VLOG_DBG("%s: entering %s", fsm->name, reconnect_state_name__(state));
+    fsm->state_entered = now;
+    fsm->state = state;
+}
+
+/* Returns the time at which 'fsm''s current state expires, that is, the
+ * earliest time at which reconnect_run() will act on it, or LLONG_MAX if the
+ * current state never expires on its own:
+ *
+ *   - S_VOID: never.
+ *
+ *   - S_BACKOFF: when the current backoff has elapsed.
+ *
+ *   - S_CONNECTING: when the current backoff, or 1000 ms if that is longer,
+ *     has elapsed.
+ *
+ *   - S_ACTIVE: one probe interval after the later of the last receipt and
+ *     the entry into the state, or never if probing is disabled.
+ *
+ *   - S_IDLE: one probe interval after the state was entered, or never if
+ *     probing is disabled.
+ *
+ *   - S_RECONNECT: immediately. */
+static long long int
+reconnect_deadline__(const struct reconnect *fsm)
+{
+    assert(fsm->state_entered != LLONG_MIN);
+    switch (fsm->state) {
+    case S_VOID:
+        return LLONG_MAX;
+
+    case S_BACKOFF:
+        return fsm->state_entered + fsm->backoff;
+
+    case S_CONNECTING:
+        return fsm->state_entered + MAX(1000, fsm->backoff);
+
+    case S_ACTIVE:
+        if (fsm->probe_interval) {
+            long long int base = MAX(fsm->last_received, fsm->state_entered);
+            return base + fsm->probe_interval;
+        }
+        return LLONG_MAX;
+
+    case S_IDLE:
+        /* The probe interval can be set to 0 while a probe is outstanding.
+         * Without this check the deadline would collapse to the time the
+         * state was entered and the connection would be dropped at once,
+         * even though a zero interval means keepalive is disabled. */
+        if (fsm->probe_interval) {
+            return fsm->state_entered + fsm->probe_interval;
+        }
+        return LLONG_MAX;
+
+    case S_RECONNECT:
+        return fsm->state_entered;
+    }
+
+    NOT_REACHED();
+}
+
+/* Determines what, if anything, the client should do with 'fsm' at time
+ * 'now'.  The return value is one of:
+ *
+ *     - 0: The client need not take any action.
+ *
+ *     - RECONNECT_CONNECT: The client should start a connection attempt and
+ *       indicate this by calling reconnect_connecting().  If the connection
+ *       attempt has definitely succeeded, it should call
+ *       reconnect_connected().  If the connection attempt has definitely
+ *       failed, it should call reconnect_connect_failed().
+ *
+ *       The FSM is smart enough to back off correctly after successful
+ *       connections that quickly abort, so it is OK to call
+ *       reconnect_connected() after a low-level successful connection
+ *       (e.g. connect()) even if the connection might soon abort due to a
+ *       failure at a high-level (e.g. SSL negotiation failure).
+ *
+ *     - RECONNECT_DISCONNECT: The client should abort the current connection
+ *       or connection attempt and call reconnect_disconnected() or
+ *       reconnect_connect_failed() to indicate it.
+ *
+ *     - RECONNECT_PROBE: The client should send some kind of request to the
+ *       peer that will elicit a response, to ensure that the connection is
+ *       indeed in working order.  (This will only be returned if the "probe
+ *       interval" is nonzero--see reconnect_set_probe_interval()).
+ */
+enum reconnect_action
+reconnect_run(struct reconnect *fsm, long long int now)
+{
+    if (now < reconnect_deadline__(fsm)) {
+        /* The current state has not expired yet.
+         *
+         * NOTE(review): a pending attempt (S_CONNECTING) yields
+         * RECONNECT_CONNECT here rather than 0, although the contract above
+         * describes RECONNECT_CONNECT as "start a connection attempt" --
+         * confirm that callers expect this. */
+        return fsm->state == S_CONNECTING ? RECONNECT_CONNECT : 0;
+    }
+
+    /* The current state's deadline has passed. */
+    switch (fsm->state) {
+    case S_VOID:
+        return 0;
+
+    case S_BACKOFF:
+        return RECONNECT_CONNECT;
+
+    case S_CONNECTING:
+    case S_RECONNECT:
+        return RECONNECT_DISCONNECT;
+
+    case S_ACTIVE:
+        VLOG_DBG("%s: idle %lld ms, sending inactivity probe", fsm->name,
+                 now - MAX(fsm->last_received, fsm->state_entered));
+        reconnect_transition__(fsm, now, S_IDLE);
+        return RECONNECT_PROBE;
+
+    case S_IDLE:
+        VLOG_ERR("%s: no response to inactivity probe after %.3g "
+                 "seconds, disconnecting",
+                 fsm->name, (now - fsm->state_entered) / 1000.0);
+        return RECONNECT_DISCONNECT;
+    }
+
+    NOT_REACHED();
+}
+
+/* Arranges for the next call to poll_block() to wake up when reconnect_run()
+ * should be called on 'fsm'.  Does nothing if reconnect_timeout() reports
+ * that no wakeup is currently needed. */
+void
+reconnect_wait(struct reconnect *fsm, long long int now)
+{
+    int wait_ms = reconnect_timeout(fsm, now);
+
+    if (wait_ms < 0) {
+        return;
+    }
+    poll_timer_wait(wait_ms);
+}
+
+/* Returns the number of milliseconds from 'now' after which reconnect_run()
+ * should be called on 'fsm' if nothing else notable happens in the meantime.
+ * The result is 0 if the deadline has already passed and is capped at
+ * INT_MAX.  Returns -1 if 'fsm''s current state has no deadline, so that no
+ * call is currently necessary. */
+int
+reconnect_timeout(struct reconnect *fsm, long long int now)
+{
+    long long int deadline = reconnect_deadline__(fsm);
+    long long int remaining;
+
+    if (deadline == LLONG_MAX) {
+        return -1;
+    }
+
+    remaining = deadline - now;
+    if (remaining < 0) {
+        return 0;
+    }
+    if (remaining > INT_MAX) {
+        return INT_MAX;
+    }
+    return remaining;
+}
+
+/* Returns true if 'fsm' is currently believed to be connected, false
+ * otherwise.  "Connected" means that 'fsm' is in S_ACTIVE or S_IDLE, that
+ * is, that reconnect_connected() was called more recently than any call to
+ * reconnect_connect_failed(), reconnect_disconnected(), or
+ * reconnect_disable(). */
+bool
+reconnect_is_connected(const struct reconnect *fsm)
+{
+    enum state current = fsm->state;
+
+    return is_connected_state(current);
+}
+
+/* Returns the number of milliseconds, as of 'now', for which 'fsm' has been
+ * continuously connected to its peer, measured from the time recorded by
+ * reconnect_connected().  Returns 0 if 'fsm' is not currently connected. */
+unsigned int
+reconnect_get_connection_duration(const struct reconnect *fsm,
+                                  long long int now)
+{
+    if (!reconnect_is_connected(fsm)) {
+        return 0;
+    }
+    return now - fsm->last_connected;
+}
+
+/* Fills in '*stats' with a snapshot of 'fsm''s statistics as of 'now'.
+ *
+ * The reported total connected duration includes the connection currently
+ * in progress, if any.  The 'state' string is a static string naming the
+ * current state. */
+void
+reconnect_get_stats(const struct reconnect *fsm, long long int now,
+                    struct reconnect_stats *stats)
+{
+    unsigned int current_duration
+        = reconnect_get_connection_duration(fsm, now);
+
+    /* Timestamps. */
+    stats->creation_time = fsm->creation_time;
+    stats->last_received = fsm->last_received;
+    stats->last_connected = fsm->last_connected;
+
+    /* Current condition of the FSM. */
+    stats->backoff = fsm->backoff;
+    stats->seqno = fsm->seqno;
+    stats->is_connected = reconnect_is_connected(fsm);
+    stats->state = reconnect_state_name__(fsm->state);
+    stats->state_elapsed = now - fsm->state_entered;
+
+    /* Connection durations. */
+    stats->current_connection_duration = current_duration;
+    stats->total_connected_duration = (stats->current_connection_duration
+                                       + fsm->total_connected_duration);
+
+    /* Counters. */
+    stats->n_attempted_connections = fsm->n_attempted_connections;
+    stats->n_successful_connections = fsm->n_successful_connections;
+}