summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJason Tedor <jason@tedor.me>2016-07-07 14:44:03 -0400
committerGitHub <noreply@github.com>2016-07-07 14:44:03 -0400
commite86aa29f671d46788c7610691f900b2547b549ff (patch)
treef6d96e838aa7e8fff94bc5863fc630c1c64535af
parentd3f8329a3dbd182ff69f74d4e177fc7a4550f1af (diff)
Die with dignity
Today when a thread encounters a fatal unrecoverable error that threatens the stability of the JVM, Elasticsearch marches on. This includes out of memory errors, stack overflow errors and other errors that leave the JVM in a questionable state. Instead, the Elasticsearch JVM should die when these errors are encountered. This commit causes this to be the case. Relates #19272
-rw-r--r--core/build.gradle2
-rw-r--r--core/src/main/java/org/elasticsearch/bootstrap/Bootstrap.java6
-rw-r--r--core/src/main/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandler.java94
-rw-r--r--core/src/main/java/org/elasticsearch/bootstrap/Security.java2
-rw-r--r--core/src/main/resources/org/elasticsearch/bootstrap/security.policy2
-rw-r--r--core/src/test/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandlerTests.java152
-rw-r--r--distribution/licenses/securesm-1.0.jar.sha11
-rw-r--r--distribution/licenses/securesm-1.1.jar.sha11
-rw-r--r--docs/reference/migration/migrate_5_0/packaging.asciidoc8
-rw-r--r--docs/reference/setup.asciidoc2
-rw-r--r--docs/reference/setup/stopping.asciidoc58
-rw-r--r--test/framework/src/main/java/org/elasticsearch/bootstrap/BootstrapForTesting.java2
12 files changed, 325 insertions, 5 deletions
diff --git a/core/build.gradle b/core/build.gradle
index 513e998ebd..6fd8c62af3 100644
--- a/core/build.gradle
+++ b/core/build.gradle
@@ -56,7 +56,7 @@ dependencies {
compile "org.apache.lucene:lucene-spatial3d:${versions.lucene}"
compile "org.apache.lucene:lucene-suggest:${versions.lucene}"
- compile 'org.elasticsearch:securesm:1.0'
+ compile 'org.elasticsearch:securesm:1.1'
// utilities
compile 'net.sf.jopt-simple:jopt-simple:5.0.2'
diff --git a/core/src/main/java/org/elasticsearch/bootstrap/Bootstrap.java b/core/src/main/java/org/elasticsearch/bootstrap/Bootstrap.java
index 3d5347307d..bdec058b04 100644
--- a/core/src/main/java/org/elasticsearch/bootstrap/Bootstrap.java
+++ b/core/src/main/java/org/elasticsearch/bootstrap/Bootstrap.java
@@ -246,6 +246,12 @@ final class Bootstrap {
// fail if somebody replaced the lucene jars
checkLucene();
+ // install the default uncaught exception handler; must be done before security is
+ // initialized as we do not want to grant the runtime permission
+ // setDefaultUncaughtExceptionHandler
+ Thread.setDefaultUncaughtExceptionHandler(
+ new ElasticsearchUncaughtExceptionHandler(() -> Node.NODE_NAME_SETTING.get(settings)));
+
INSTANCE.setup(true, settings, environment);
INSTANCE.start();
diff --git a/core/src/main/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandler.java b/core/src/main/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandler.java
new file mode 100644
index 0000000000..405e919fab
--- /dev/null
+++ b/core/src/main/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandler.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.bootstrap;
+
+import org.apache.lucene.index.MergePolicy;
+import org.elasticsearch.common.SuppressForbidden;
+import org.elasticsearch.common.logging.ESLogger;
+import org.elasticsearch.common.logging.Loggers;
+
+import java.io.IOError;
+import java.util.Objects;
+import java.util.function.Supplier;
+
+/**
+ * Uncaught exception handler that halts the JVM when a thread dies from a fatal error and only
+ * logs a warning for any other uncaught throwable.
+ *
+ * A throwable is treated as fatal when it is an {@link Error}, or when it is a
+ * {@link MergePolicy.MergeException} whose cause is an {@link Error}.
+ */
+class ElasticsearchUncaughtExceptionHandler implements Thread.UncaughtExceptionHandler {
+
+    private final Supplier<String> loggingPrefixSupplier;
+
+    /**
+     * @param loggingPrefixSupplier supplies the prefix handed to the logger each time an uncaught
+     *                              throwable is reported; must not be {@code null}
+     */
+    ElasticsearchUncaughtExceptionHandler(final Supplier<String> loggingPrefixSupplier) {
+        this.loggingPrefixSupplier = Objects.requireNonNull(loggingPrefixSupplier);
+    }
+
+    /**
+     * Reports the throwable and, if it is fatal, halts the JVM with a status derived from the
+     * fatal error.
+     *
+     * @param t the thread that terminated
+     * @param e the throwable that terminated the thread
+     */
+    @Override
+    public void uncaughtException(Thread t, Throwable e) {
+        if (isFatalUncaught(e)) {
+            try {
+                onFatalUncaught(t.getName(), e);
+            } finally {
+                // we use specific error codes in case the above notification failed, at least we
+                // will have some indication of the error bringing us down; the status is taken
+                // from the fatal error itself so that an error wrapped in a merge exception is
+                // reported with the same status as the bare error
+                halt(exitStatus(fatalError(e)));
+            }
+        } else {
+            onNonFatalUncaught(t.getName(), e);
+        }
+    }
+
+    // visible for testing
+    /**
+     * @param e the uncaught throwable
+     * @return {@code true} if {@code e} is an {@link Error}, or a merge exception caused by one
+     */
+    static boolean isFatalUncaught(Throwable e) {
+        return isFatalCause(e) || (e instanceof MergePolicy.MergeException && isFatalCause(e.getCause()));
+    }
+
+    private static boolean isFatalCause(Throwable cause) {
+        return cause instanceof Error;
+    }
+
+    /**
+     * Returns the error that makes {@code e} fatal: {@code e} itself when it is an {@link Error},
+     * otherwise its cause. Only meaningful when {@link #isFatalUncaught(Throwable)} holds.
+     */
+    private static Throwable fatalError(Throwable e) {
+        return isFatalCause(e) ? e : e.getCause();
+    }
+
+    /**
+     * Maps a fatal error to the status the JVM is halted with; errors without a dedicated status
+     * map to {@code 1}.
+     */
+    private static int exitStatus(Throwable fatal) {
+        if (fatal instanceof InternalError) {
+            return 128;
+        } else if (fatal instanceof OutOfMemoryError) {
+            return 127;
+        } else if (fatal instanceof StackOverflowError) {
+            return 126;
+        } else if (fatal instanceof UnknownError) {
+            return 125;
+        } else if (fatal instanceof IOError) {
+            return 124;
+        } else {
+            return 1;
+        }
+    }
+
+    // visible for testing
+    /**
+     * Logs a fatal uncaught throwable at error level.
+     *
+     * @param threadName name of the thread that terminated
+     * @param t          the throwable that terminated the thread
+     */
+    void onFatalUncaught(final String threadName, final Throwable t) {
+        final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get());
+        logger.error("fatal error in thread [{}], exiting", t, threadName);
+    }
+
+    // visible for testing
+    /**
+     * Logs a non-fatal uncaught throwable at warn level.
+     *
+     * @param threadName name of the thread that terminated
+     * @param t          the throwable that terminated the thread
+     */
+    void onNonFatalUncaught(final String threadName, final Throwable t) {
+        final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get());
+        logger.warn("uncaught exception in thread [{}]", t, threadName);
+    }
+
+    // visible for testing
+    /**
+     * Halts the JVM with the given status.
+     *
+     * @param status the status passed to {@link Runtime#halt(int)}
+     */
+    @SuppressForbidden(reason = "halt")
+    void halt(int status) {
+        // we halt to prevent shutdown hooks from running
+        Runtime.getRuntime().halt(status);
+    }
+
+}
diff --git a/core/src/main/java/org/elasticsearch/bootstrap/Security.java b/core/src/main/java/org/elasticsearch/bootstrap/Security.java
index c44b46c6b0..05d2c8c1bf 100644
--- a/core/src/main/java/org/elasticsearch/bootstrap/Security.java
+++ b/core/src/main/java/org/elasticsearch/bootstrap/Security.java
@@ -120,7 +120,7 @@ final class Security {
Policy.setPolicy(new ESPolicy(createPermissions(environment), getPluginPermissions(environment), filterBadDefaults));
// enable security manager
- System.setSecurityManager(new SecureSM());
+ System.setSecurityManager(new SecureSM(new String[] { "org.elasticsearch.bootstrap." }));
// do some basic tests
selfTest();
diff --git a/core/src/main/resources/org/elasticsearch/bootstrap/security.policy b/core/src/main/resources/org/elasticsearch/bootstrap/security.policy
index dcf37d3dd5..a4a45585b5 100644
--- a/core/src/main/resources/org/elasticsearch/bootstrap/security.policy
+++ b/core/src/main/resources/org/elasticsearch/bootstrap/security.policy
@@ -24,7 +24,7 @@
//// SecurityManager impl:
//// Must have all permissions to properly perform access checks
-grant codeBase "${codebase.securesm-1.0.jar}" {
+grant codeBase "${codebase.securesm-1.1.jar}" {
permission java.security.AllPermission;
};
diff --git a/core/src/test/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandlerTests.java b/core/src/test/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandlerTests.java
new file mode 100644
index 0000000000..e4ff83e9b4
--- /dev/null
+++ b/core/src/test/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandlerTests.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.bootstrap;
+
+import org.apache.lucene.index.MergePolicy;
+import org.elasticsearch.test.ESTestCase;
+import org.junit.Before;
+
+import java.io.IOError;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
+
+import static org.hamcrest.CoreMatchers.equalTo;
+
+public class ElasticsearchUncaughtExceptionHandlerTests extends ESTestCase {
+
+ private Map<Class<? extends Error>, Integer> expectedStatus;
+
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ Map<Class<? extends Error>, Integer> expectedStatus = new HashMap<>();
+ expectedStatus.put(InternalError.class, 128);
+ expectedStatus.put(OutOfMemoryError.class, 127);
+ expectedStatus.put(StackOverflowError.class, 126);
+ expectedStatus.put(UnknownError.class, 125);
+ expectedStatus.put(IOError.class, 124);
+ this.expectedStatus = Collections.unmodifiableMap(expectedStatus);
+ }
+
+ public void testUncaughtError() throws InterruptedException {
+ final Error error = randomFrom(
+ new InternalError(),
+ new OutOfMemoryError(),
+ new StackOverflowError(),
+ new UnknownError(),
+ new IOError(new IOException("fatal")),
+ new Error() {});
+ final Thread thread = new Thread(() -> { throw error; });
+ final String name = randomAsciiOfLength(10);
+ thread.setName(name);
+ final AtomicBoolean halt = new AtomicBoolean();
+ final AtomicInteger observedStatus = new AtomicInteger();
+ final AtomicReference<String> threadNameReference = new AtomicReference<>();
+ final AtomicReference<Throwable> throwableReference = new AtomicReference<>();
+ thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtError") {
+
+ @Override
+ void halt(int status) {
+ halt.set(true);
+ observedStatus.set(status);
+ }
+
+ @Override
+ void onFatalUncaught(String threadName, Throwable t) {
+ threadNameReference.set(threadName);
+ throwableReference.set(t);
+ }
+
+ @Override
+ void onNonFatalUncaught(String threadName, Throwable t) {
+ fail();
+ }
+
+ });
+ thread.start();
+ thread.join();
+ assertTrue(halt.get());
+ final int status;
+ if (expectedStatus.containsKey(error.getClass())) {
+ status = expectedStatus.get(error.getClass());
+ } else {
+ status = 1;
+ }
+ assertThat(observedStatus.get(), equalTo(status));
+ assertThat(threadNameReference.get(), equalTo(name));
+ assertThat(throwableReference.get(), equalTo(error));
+ }
+
+ public void testUncaughtException() throws InterruptedException {
+ final RuntimeException e = new RuntimeException("boom");
+ final Thread thread = new Thread(() -> { throw e; });
+ final String name = randomAsciiOfLength(10);
+ thread.setName(name);
+ final AtomicReference<String> threadNameReference = new AtomicReference<>();
+ final AtomicReference<Throwable> throwableReference = new AtomicReference<>();
+ thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtException") {
+ @Override
+ void halt(int status) {
+ fail();
+ }
+
+ @Override
+ void onFatalUncaught(String threadName, Throwable t) {
+ fail();
+ }
+
+ @Override
+ void onNonFatalUncaught(String threadName, Throwable t) {
+ threadNameReference.set(threadName);
+ throwableReference.set(t);
+ }
+ });
+ thread.start();
+ thread.join();
+ assertThat(threadNameReference.get(), equalTo(name));
+ assertThat(throwableReference.get(), equalTo(e));
+ }
+
+ public void testIsFatalCause() {
+ assertFatal(new MergePolicy.MergeException(new OutOfMemoryError(), null));
+ assertFatal(new OutOfMemoryError());
+ assertFatal(new StackOverflowError());
+ assertFatal(new InternalError());
+ assertFatal(new UnknownError());
+ assertFatal(new IOError(new IOException()));
+ assertNonFatal(new RuntimeException());
+ assertNonFatal(new UncheckedIOException(new IOException()));
+ }
+
+ private void assertFatal(Throwable cause) {
+ assertTrue(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause));
+ }
+
+ private void assertNonFatal(Throwable cause) {
+ assertFalse(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause));
+ }
+
+}
diff --git a/distribution/licenses/securesm-1.0.jar.sha1 b/distribution/licenses/securesm-1.0.jar.sha1
deleted file mode 100644
index 96d45d93e6..0000000000
--- a/distribution/licenses/securesm-1.0.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-c0c6cf986ba0057390bfcc80c366a0e3157f944b
diff --git a/distribution/licenses/securesm-1.1.jar.sha1 b/distribution/licenses/securesm-1.1.jar.sha1
new file mode 100644
index 0000000000..9144a082b6
--- /dev/null
+++ b/distribution/licenses/securesm-1.1.jar.sha1
@@ -0,0 +1 @@
+1e423447d020041534be94c0f31a49fbdc1f2950 \ No newline at end of file
diff --git a/docs/reference/migration/migrate_5_0/packaging.asciidoc b/docs/reference/migration/migrate_5_0/packaging.asciidoc
index 977e20a76b..74faf3bb7d 100644
--- a/docs/reference/migration/migrate_5_0/packaging.asciidoc
+++ b/docs/reference/migration/migrate_5_0/packaging.asciidoc
@@ -55,3 +55,11 @@ from Elasticsearch.
Additionally, it was previously possible to set any setting in
Elasticsearch via JVM system properties. This has been removed from
Elasticsearch.
+
+==== Dying on fatal errors
+
+Previous versions of Elasticsearch would not halt the JVM if out of memory errors or other fatal
+errors were encountered during the life of the Elasticsearch instance. Because such errors leave
+the JVM in a questionable state, the best course of action is to halt the JVM when this occurs.
+Starting in Elasticsearch 5.x, this is now the case. Operators should consider configuring their
+Elasticsearch services so that they respawn automatically in the case of such a fatal crash.
diff --git a/docs/reference/setup.asciidoc b/docs/reference/setup.asciidoc
index a883b0cc54..ae3c5b3beb 100644
--- a/docs/reference/setup.asciidoc
+++ b/docs/reference/setup.asciidoc
@@ -47,3 +47,5 @@ include::setup/bootstrap-checks.asciidoc[]
include::setup/sysconfig.asciidoc[]
include::setup/upgrade.asciidoc[]
+
+include::setup/stopping.asciidoc[]
diff --git a/docs/reference/setup/stopping.asciidoc b/docs/reference/setup/stopping.asciidoc
new file mode 100644
index 0000000000..1aa281f050
--- /dev/null
+++ b/docs/reference/setup/stopping.asciidoc
@@ -0,0 +1,58 @@
+[[stopping-elasticsearch]]
+=== Stopping Elasticsearch
+
+An orderly shutdown of Elasticsearch ensures that Elasticsearch has a chance to clean up and close
+outstanding resources. For example, a node that is shut down in an orderly fashion will remove itself
+from the cluster, sync translogs to disk, and perform other related cleanup activities. You can help
+ensure an orderly shutdown by properly stopping Elasticsearch.
+
+If you're running Elasticsearch as a service, you can stop Elasticsearch via the service management
+functionality provided by your installation.
+
+If you're running Elasticsearch directly, you can stop Elasticsearch by sending control-C if you're
+running Elasticsearch in the console, or by sending `SIGTERM` to the Elasticsearch process on a
+POSIX system. You can obtain the PID to send the signal to via various tools (e.g., `ps` or `jps`):
+
+[source,sh]
+--------------------------------------------------
+$ jps | grep Elasticsearch
+14542 Elasticsearch
+--------------------------------------------------
+
+From the Elasticsearch startup logs:
+
+[source,sh]
+--------------------------------------------------
+[2016-07-07 12:26:18,908][INFO ][node ] [Reaper] version[5.0.0-alpha4], pid[15399], build[3f5b994/2016-06-27T16:23:46.861Z], OS[Mac OS X/10.11.5/x86_64], JVM[Oracle Corporation/Java HotSpot(TM) 64-Bit Server VM/1.8.0_92/25.92-b14]
+--------------------------------------------------
+
+Or by specifying a location to write a PID file to on startup (`-p <path>`):
+
+[source,sh]
+--------------------------------------------------
+$ ./bin/elasticsearch -p /tmp/elasticsearch-pid -d
+$ cat /tmp/elasticsearch-pid && echo
+15516
+$ kill -SIGTERM 15516
+--------------------------------------------------
+
+[[fatal-errors]]
+[float]
+=== Stopping on Fatal Errors
+
+During the life of the Elasticsearch virtual machine, certain fatal errors could arise that put the
+virtual machine in a questionable state. Such fatal errors include out of memory errors, internal
+errors in the virtual machine, and serious I/O errors.
+
+When Elasticsearch detects that the virtual machine has encountered such a fatal error, Elasticsearch
+will attempt to log the error and then will halt the virtual machine. When Elasticsearch initiates
+such a shutdown, it does not go through an orderly shutdown as described above. The Elasticsearch
+process will also return with a special status code indicating the nature of the error.
+
+[horizontal]
+JVM internal error:: 128
+Out of memory error:: 127
+Stack overflow error:: 126
+Unknown virtual machine error:: 125
+Serious I/O error:: 124
+Unknown fatal error:: 1
diff --git a/test/framework/src/main/java/org/elasticsearch/bootstrap/BootstrapForTesting.java b/test/framework/src/main/java/org/elasticsearch/bootstrap/BootstrapForTesting.java
index 5b79721948..fe624297e7 100644
--- a/test/framework/src/main/java/org/elasticsearch/bootstrap/BootstrapForTesting.java
+++ b/test/framework/src/main/java/org/elasticsearch/bootstrap/BootstrapForTesting.java
@@ -150,7 +150,7 @@ public class BootstrapForTesting {
return esPolicy.implies(domain, permission) || testFramework.implies(domain, permission);
}
});
- System.setSecurityManager(new SecureSM(true));
+ System.setSecurityManager(SecureSM.createTestSecureSM());
Security.selfTest();
// guarantee plugin classes are initialized first, in case they have one-time hacks.