diff options
author | Ben Pfaff <blp@nicira.com> | 2009-11-10 15:30:49 -0800 |
---|---|---|
committer | Ben Pfaff <blp@nicira.com> | 2009-11-12 12:56:21 -0800 |
commit | 3ed497fc10033c9857140270d60ef6aa2d7c0c08 (patch) | |
tree | 13fc8d25e8023b1a33c2b003e35c53e963b80423 /lib/reconnect.c | |
parent | 8ecd53084ce42e2e868a9c24c57b2d9a7e59cfc2 (diff) |
New "reconnect" library for managing network connection attempts.
This library implements the reconnection FSM used by the "rconn" library.
Therefore, it makes sense to change rconn to use this, and I have a patch
to do that, but I am not applying it at the moment to avoid changing unrelated
code on the "db" branch.
Diffstat (limited to 'lib/reconnect.c')
-rw-r--r-- | lib/reconnect.c | 523 |
1 files changed, 523 insertions, 0 deletions
diff --git a/lib/reconnect.c b/lib/reconnect.c new file mode 100644 index 00000000..fadeeb89 --- /dev/null +++ b/lib/reconnect.c @@ -0,0 +1,523 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <config.h> +#include "reconnect.h" + +#include <assert.h> +#include <stdlib.h> + +#include "poll-loop.h" + +#define THIS_MODULE VLM_reconnect +#include "vlog.h" + +#define STATES \ + STATE(VOID, 1 << 0) \ + STATE(BACKOFF, 1 << 1) \ + STATE(CONNECTING, 1 << 2) \ + STATE(ACTIVE, 1 << 3) \ + STATE(IDLE, 1 << 4) \ + STATE(RECONNECT, 1 << 5) +enum state { +#define STATE(NAME, VALUE) S_##NAME = VALUE, + STATES +#undef STATE +}; + +static bool +is_connected_state(enum state state) +{ + return (state & (S_ACTIVE | S_IDLE)) != 0; +} + +struct reconnect { + /* Configuration. */ + char *name; + int min_backoff; + int max_backoff; + int probe_interval; + + /* State. */ + enum state state; + long long int state_entered; + int backoff; + long long int last_received; + long long int last_connected; + + /* These values are simply for statistics reporting, not otherwise used + * directly by anything internal. */ + long long int creation_time; + unsigned int n_attempted_connections, n_successful_connections; + unsigned int total_connected_duration; + unsigned int seqno; +}; + +static void reconnect_transition__(struct reconnect *, long long int now, + enum state state); +static long long int reconnect_deadline__(const struct reconnect *); + +static const char * +reconnect_state_name__(enum state state) +{ + switch (state) { +#define STATE(NAME, VALUE) case S_##NAME: return #NAME; + STATES +#undef STATE + } + return "***ERROR***"; +} + +/* Creates and returns a new reconnect FSM with default settings. The FSM is + * initially disabled. The caller will likely want to call reconnect_enable() + * and reconnect_set_name() on the returned object. */ +struct reconnect * +reconnect_create(long long int now) +{ + struct reconnect *fsm = xzalloc(sizeof *fsm); + + fsm->name = xstrdup("void"); + fsm->min_backoff = 1000; + fsm->max_backoff = 8000; + fsm->probe_interval = 5000; + + fsm->state = S_VOID; + fsm->state_entered = now; + fsm->backoff = 0; + fsm->last_received = now; + fsm->last_connected = now; + fsm->creation_time = now; + + return fsm; +} + +/* Frees 'fsm'. */ +void +reconnect_destroy(struct reconnect *fsm) +{ + if (fsm) { + free(fsm->name); + free(fsm); + } +} + +/* Returns 'fsm''s name. */ +const char * +reconnect_get_name(const struct reconnect *fsm) +{ + return fsm->name; +} + +/* Sets 'fsm''s name to 'name'. If 'name' is null, then "void" is used + * instead. + * + * The name set for 'fsm' is used in log messages. */ +void +reconnect_set_name(struct reconnect *fsm, const char *name) +{ + free(fsm->name); + fsm->name = xstrdup(name ? name : "void"); +} + +/* Return the minimum number of milliseconds to back off between consecutive + * connection attempts. The default is 1000 ms. */ +int +reconnect_get_min_backoff(const struct reconnect *fsm) +{ + return fsm->min_backoff; +} + +/* Return the maximum number of milliseconds to back off between consecutive + * connection attempts. The default is 8000 ms. */ +int +reconnect_get_max_backoff(const struct reconnect *fsm) +{ + return fsm->max_backoff; +} + +/* Returns the "probe interval" for 'fsm' in milliseconds. If this is zero, it + * disables the connection keepalive feature. If it is nonzero, then if the + * interval passes while 'fsm' is connected and without reconnect_received() + * being called for 'fsm', reconnect_run() returns RECONNECT_PROBE. If the + * interval passes again without reconnect_received() being called, + * reconnect_run() returns RECONNECT_DISCONNECT for 'fsm'. */ +int +reconnect_get_probe_interval(const struct reconnect *fsm) +{ + return fsm->probe_interval; +} + +/* Configures the backoff parameters for 'fsm'. 'min_backoff' is the minimum + * number of milliseconds, and 'max_backoff' is the maximum, between connection + * attempts. + * + * 'min_backoff' must be at least 1000, and 'max_backoff' must be greater than + * or equal to 'min_backoff'. */ +void +reconnect_set_backoff(struct reconnect *fsm, int min_backoff, int max_backoff) +{ + fsm->min_backoff = MAX(min_backoff, 1000); + fsm->max_backoff = max_backoff ? MAX(max_backoff, 1000) : 8000; + if (fsm->min_backoff > fsm->max_backoff) { + fsm->max_backoff = fsm->min_backoff; + } + + if (fsm->state == S_BACKOFF && fsm->backoff > max_backoff) { + fsm->backoff = max_backoff; + } +} + +/* Sets the "probe interval" for 'fsm' to 'probe_interval', in milliseconds. + * If this is zero, it disables the connection keepalive feature. If it is + * nonzero, then if the interval passes while 'fsm' is connected and without + * reconnect_received() being called for 'fsm', reconnect_run() returns + * RECONNECT_PROBE. If the interval passes again without reconnect_received() + * being called, reconnect_run() returns RECONNECT_DISCONNECT for 'fsm'. + * + * If 'probe_interval' is nonzero, then it will be forced to a value of at + * least 1000 ms. */ +void +reconnect_set_probe_interval(struct reconnect *fsm, int probe_interval) +{ + fsm->probe_interval = probe_interval ? MAX(1000, probe_interval) : 0; +} + +/* Returns true if 'fsm' has been enabled with reconnect_enable(). Calling + * another function that indicates a change in connection state, such as + * reconnect_disconnected() or reconnect_force_reconnect(), will also enable + * a reconnect FSM. */ +bool +reconnect_is_enabled(const struct reconnect *fsm) +{ + return fsm->state != S_VOID; +} + +/* If 'fsm' is disabled (the default for newly created FSMs), enables it, so + * that the next call to reconnect_run() for 'fsm' will return + * RECONNECT_CONNECT. + * + * If 'fsm' is not disabled, this function has no effect. */ +void +reconnect_enable(struct reconnect *fsm, long long int now) +{ + if (fsm->state == S_VOID) { + reconnect_transition__(fsm, now, S_BACKOFF); + fsm->backoff = 0; + } +} + +/* Disables 'fsm'. Until 'fsm' is enabled again, reconnect_run() will always + * return 0. */ +void +reconnect_disable(struct reconnect *fsm, long long int now) +{ + if (fsm->state != S_VOID) { + reconnect_transition__(fsm, now, S_VOID); + } +} + +/* If 'fsm' is enabled and currently connected (or attempting to connect), + * forces reconnect_run() for 'fsm' to return RECONNECT_DISCONNECT the next + * time it is called, which should cause the client to drop the connection (or + * attempt), back off, and then reconnect. */ +void +reconnect_force_reconnect(struct reconnect *fsm, long long int now) +{ + if (fsm->state & (S_CONNECTING | S_ACTIVE | S_IDLE)) { + reconnect_transition__(fsm, now, S_RECONNECT); + } +} + +/* Tell 'fsm' that the connection dropped or that a connection attempt failed. + * 'error' specifies the reason: a positive value represents an errno value, + * EOF indicates that the connection was closed by the peer (e.g. read() + * returned 0), and 0 indicates no specific error. + * + * The FSM will back off, then reconnect. */ +void +reconnect_disconnected(struct reconnect *fsm, long long int now, int error) +{ + if (fsm->state != S_BACKOFF) { + /* Report what happened. */ + if (fsm->state & (S_ACTIVE | S_IDLE)) { + if (error > 0) { + VLOG_WARN("%s: connection dropped (%s)", + fsm->name, strerror(error)); + } else if (error == EOF) { + VLOG_INFO("%s: connection closed by peer", fsm->name); + } else { + VLOG_INFO("%s: connection dropped", fsm->name); + } + } else { + if (error > 0) { + VLOG_WARN("%s: connection attempt failed (%s)", + fsm->name, strerror(error)); + } else { + VLOG_INFO("%s: connection attempt timed out", fsm->name); + } + } + + /* Back off. */ + if (fsm->state & (S_ACTIVE | S_IDLE) + && fsm->last_received - fsm->last_connected >= fsm->backoff) { + fsm->backoff = fsm->min_backoff; + } else { + if (fsm->backoff < fsm->min_backoff) { + fsm->backoff = fsm->min_backoff; + } else if (fsm->backoff >= fsm->max_backoff / 2) { + fsm->backoff = fsm->max_backoff; + } else { + fsm->backoff *= 2; + } + VLOG_INFO("%s: waiting %.3g seconds before reconnect\n", + fsm->name, fsm->backoff / 1000.0); + } + reconnect_transition__(fsm, now, S_BACKOFF); + } +} + +/* Tell 'fsm' that a connection attempt is in progress. + * + * The FSM will start a timer, after which the connection attempt will be + * aborted (by returning RECONNECT_DISCONNECT from reconect_run()). */ +void +reconnect_connecting(struct reconnect *fsm, long long int now) +{ + if (fsm->state != S_CONNECTING) { + VLOG_INFO("%s: connecting...", fsm->name); + reconnect_transition__(fsm, now, S_CONNECTING); + } +} + +/* Tell 'fsm' that the connection was successful. + * + * The FSM will start the probe interval timer, which is reset by + * reconnect_received(). If the timer expires, a probe will be sent (by + * returning RECONNECT_PROBE from reconnect_run()). If the timer expires + * again without being reset, the connection will be aborted (by returning + * RECONNECT_DISCONNECT from reconnect_run()). */ +void +reconnect_connected(struct reconnect *fsm, long long int now) +{ + if (!is_connected_state(fsm->state)) { + reconnect_connecting(fsm, now); + + VLOG_INFO("%s: connected", fsm->name); + reconnect_transition__(fsm, now, S_ACTIVE); + fsm->last_connected = now; + } +} + +/* Tell 'fsm' that the connection attempt failed. + * + * The FSM will back off and attempt to reconnect. */ +void +reconnect_connect_failed(struct reconnect *fsm, long long int now, int error) +{ + reconnect_connecting(fsm, now); + reconnect_disconnected(fsm, now, error); +} + +/* Tell 'fsm' that some data was received. This resets the probe interval + * timer, so that the connection is known not to be idle. */ +void +reconnect_received(struct reconnect *fsm, long long int now) +{ + if (fsm->state != S_ACTIVE) { + reconnect_transition__(fsm, now, S_ACTIVE); + } + fsm->last_received = now; +} + +static void +reconnect_transition__(struct reconnect *fsm, long long int now, + enum state state) +{ + if (fsm->state == S_CONNECTING) { + fsm->n_attempted_connections++; + if (state == S_ACTIVE) { + fsm->n_successful_connections++; + } + } + if (is_connected_state(fsm->state) != is_connected_state(state)) { + if (is_connected_state(fsm->state)) { + fsm->total_connected_duration += now - fsm->last_connected; + } + fsm->seqno++; + } + + VLOG_DBG("%s: entering %s", fsm->name, reconnect_state_name__(state)); + fsm->state = state; + fsm->state_entered = now; +} + +static long long int +reconnect_deadline__(const struct reconnect *fsm) +{ + assert(fsm->state_entered != LLONG_MIN); + switch (fsm->state) { + case S_VOID: + return LLONG_MAX; + + case S_BACKOFF: + return fsm->state_entered + fsm->backoff; + + case S_CONNECTING: + return fsm->state_entered + MAX(1000, fsm->backoff); + + case S_ACTIVE: + if (fsm->probe_interval) { + long long int base = MAX(fsm->last_received, fsm->state_entered); + return base + fsm->probe_interval; + } + return LLONG_MAX; + + case S_IDLE: + return fsm->state_entered + fsm->probe_interval; + + case S_RECONNECT: + return fsm->state_entered; + } + + NOT_REACHED(); +} + +/* Assesses whether any action should be taken on 'fsm'. The return value is + * one of: + * + * - 0: The client need not take any action. + * + * - RECONNECT_CONNECT: The client should start a connection attempt and + * indicate this by calling reconnect_connecting(). If the connection + * attempt has definitely succeeded, it should call + * reconnect_connected(). If the connection attempt has definitely + * failed, it should call reconnect_connect_failed(). + * + * The FSM is smart enough to back off correctly after successful + * connections that quickly abort, so it is OK to call + * reconnect_connected() after a low-level successful connection + * (e.g. connect()) even if the connection might soon abort due to a + * failure at a high-level (e.g. SSL negotiation failure). + * + * - RECONNECT_DISCONNECT: The client should abort the current connection + * or connection attempt and call reconnect_disconnected() or + * reconnect_connect_failed() to indicate it. + * + * - RECONNECT_PROBE: The client should send some kind of request to the + * peer that will elicit a response, to ensure that the connection is + * indeed in working order. (This will only be returned if the "probe + * interval" is nonzero--see reconnect_set_probe_interval()). + */ +enum reconnect_action +reconnect_run(struct reconnect *fsm, long long int now) +{ + if (now >= reconnect_deadline__(fsm)) { + switch (fsm->state) { + case S_VOID: + return 0; + + case S_BACKOFF: + return RECONNECT_CONNECT; + + case S_CONNECTING: + return RECONNECT_DISCONNECT; + + case S_ACTIVE: + VLOG_DBG("%s: idle %lld ms, sending inactivity probe", fsm->name, + now - MAX(fsm->last_received, fsm->state_entered)); + reconnect_transition__(fsm, now, S_IDLE); + return RECONNECT_PROBE; + + case S_IDLE: + VLOG_ERR("%s: no response to inactivity probe after %.3g " + "seconds, disconnecting", + fsm->name, (now - fsm->state_entered) / 1000.0); + return RECONNECT_DISCONNECT; + + case S_RECONNECT: + return RECONNECT_DISCONNECT; + } + + NOT_REACHED(); + } else { + return fsm->state == S_CONNECTING ? RECONNECT_CONNECT : 0; + } +} + +/* Causes the next call to poll_block() to wake up when reconnect_run() should + * be called on 'fsm'. */ +void +reconnect_wait(struct reconnect *fsm, long long int now) +{ + int timeout = reconnect_timeout(fsm, now); + if (timeout >= 0) { + poll_timer_wait(timeout); + } +} + +/* Returns the number of milliseconds after which reconnect_run() should be + * called on 'fsm' if nothing else notable happens in the meantime, or a + * negative number if this is currently unnecessary. */ +int +reconnect_timeout(struct reconnect *fsm, long long int now) +{ + long long int deadline = reconnect_deadline__(fsm); + if (deadline != LLONG_MAX) { + long long int remaining = deadline - now; + return MAX(0, MIN(INT_MAX, remaining)); + } + return -1; +} + +/* Returns true if 'fsm' is currently believed to be connected, that is, if + * reconnect_connected() was called more recently than any call to + * reconnect_connect_failed() or reconnect_disconnected() or + * reconnect_disable(), and false otherwise. */ +bool +reconnect_is_connected(const struct reconnect *fsm) +{ + return is_connected_state(fsm->state); +} + +/* Returns the number of milliseconds for which 'fsm' has been continuously + * connected to its peer. (If 'fsm' is not currently connected, this is 0.) */ +unsigned int +reconnect_get_connection_duration(const struct reconnect *fsm, + long long int now) +{ + return reconnect_is_connected(fsm) ? now - fsm->last_connected : 0; +} + +/* Copies various statistics for 'fsm' into '*stats'. */ +void +reconnect_get_stats(const struct reconnect *fsm, long long int now, + struct reconnect_stats *stats) +{ + stats->creation_time = fsm->creation_time; + stats->last_received = fsm->last_received; + stats->last_connected = fsm->last_connected; + stats->backoff = fsm->backoff; + stats->seqno = fsm->seqno; + stats->is_connected = reconnect_is_connected(fsm); + stats->current_connection_duration + = reconnect_get_connection_duration(fsm, now); + stats->total_connected_duration = (stats->current_connection_duration + + fsm->total_connected_duration); + stats->n_attempted_connections = fsm->n_attempted_connections; + stats->n_successful_connections = fsm->n_successful_connections; + stats->state = reconnect_state_name__(fsm->state); + stats->state_elapsed = now - fsm->state_entered; +} |