diff options
author | Jason Tedor <jason@tedor.me> | 2016-07-07 14:44:03 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-07-07 14:44:03 -0400 |
commit | e86aa29f671d46788c7610691f900b2547b549ff (patch) | |
tree | f6d96e838aa7e8fff94bc5863fc630c1c64535af | |
parent | d3f8329a3dbd182ff69f74d4e177fc7a4550f1af (diff) |
Die with dignity
Today when a thread encounters a fatal unrecoverable error that
threatens the stability of the JVM, Elasticsearch marches on. This
includes out of memory errors, stack overflow errors and other errors
that leave the JVM in a questionable state. Instead, the Elasticsearch
JVM should die when these errors are encountered. This commit causes
this to be the case.
Relates #19272
12 files changed, 325 insertions, 5 deletions
diff --git a/core/build.gradle b/core/build.gradle index 513e998ebd..6fd8c62af3 100644 --- a/core/build.gradle +++ b/core/build.gradle @@ -56,7 +56,7 @@ dependencies { compile "org.apache.lucene:lucene-spatial3d:${versions.lucene}" compile "org.apache.lucene:lucene-suggest:${versions.lucene}" - compile 'org.elasticsearch:securesm:1.0' + compile 'org.elasticsearch:securesm:1.1' // utilities compile 'net.sf.jopt-simple:jopt-simple:5.0.2' diff --git a/core/src/main/java/org/elasticsearch/bootstrap/Bootstrap.java b/core/src/main/java/org/elasticsearch/bootstrap/Bootstrap.java index 3d5347307d..bdec058b04 100644 --- a/core/src/main/java/org/elasticsearch/bootstrap/Bootstrap.java +++ b/core/src/main/java/org/elasticsearch/bootstrap/Bootstrap.java @@ -246,6 +246,12 @@ final class Bootstrap { // fail if somebody replaced the lucene jars checkLucene(); + // install the default uncaught exception handler; must be done before security is + // initialized as we do not want to grant the runtime permission + // setDefaultUncaughtExceptionHandler + Thread.setDefaultUncaughtExceptionHandler( + new ElasticsearchUncaughtExceptionHandler(() -> Node.NODE_NAME_SETTING.get(settings))); + INSTANCE.setup(true, settings, environment); INSTANCE.start(); diff --git a/core/src/main/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandler.java b/core/src/main/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandler.java new file mode 100644 index 0000000000..405e919fab --- /dev/null +++ b/core/src/main/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandler.java @@ -0,0 +1,94 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.bootstrap; + +import org.apache.lucene.index.MergePolicy; +import org.elasticsearch.common.SuppressForbidden; +import org.elasticsearch.common.logging.ESLogger; +import org.elasticsearch.common.logging.Loggers; + +import java.io.IOError; +import java.util.Objects; +import java.util.function.Supplier; + +class ElasticsearchUncaughtExceptionHandler implements Thread.UncaughtExceptionHandler { + + private final Supplier<String> loggingPrefixSupplier; + + ElasticsearchUncaughtExceptionHandler(final Supplier<String> loggingPrefixSupplier) { + this.loggingPrefixSupplier = Objects.requireNonNull(loggingPrefixSupplier); + } + + @Override + public void uncaughtException(Thread t, Throwable e) { + if (isFatalUncaught(e)) { + try { + onFatalUncaught(t.getName(), e); + } finally { + // we use specific error codes in case the above notification failed, at least we + // will have some indication of the error bringing us down + if (e instanceof InternalError) { + halt(128); + } else if (e instanceof OutOfMemoryError) { + halt(127); + } else if (e instanceof StackOverflowError) { + halt(126); + } else if (e instanceof UnknownError) { + halt(125); + } else if (e instanceof IOError) { + halt(124); + } else { + halt(1); + } + } + } else { + onNonFatalUncaught(t.getName(), e); + } + } + + // visible for testing + static boolean isFatalUncaught(Throwable e) { + return isFatalCause(e) || (e instanceof MergePolicy.MergeException && isFatalCause(e.getCause())); + } + + private static boolean isFatalCause(Throwable cause) { + return 
cause instanceof Error; + } + + // visible for testing + void onFatalUncaught(final String threadName, final Throwable t) { + final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get()); + logger.error("fatal error in thread [{}], exiting", t, threadName); + } + + // visible for testing + void onNonFatalUncaught(final String threadName, final Throwable t) { + final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get()); + logger.warn("uncaught exception in thread [{}]", t, threadName); + } + + // visible for testing + @SuppressForbidden(reason = "halt") + void halt(int status) { + // we halt to prevent shutdown hooks from running + Runtime.getRuntime().halt(status); + } + +} diff --git a/core/src/main/java/org/elasticsearch/bootstrap/Security.java b/core/src/main/java/org/elasticsearch/bootstrap/Security.java index c44b46c6b0..05d2c8c1bf 100644 --- a/core/src/main/java/org/elasticsearch/bootstrap/Security.java +++ b/core/src/main/java/org/elasticsearch/bootstrap/Security.java @@ -120,7 +120,7 @@ final class Security { Policy.setPolicy(new ESPolicy(createPermissions(environment), getPluginPermissions(environment), filterBadDefaults)); // enable security manager - System.setSecurityManager(new SecureSM()); + System.setSecurityManager(new SecureSM(new String[] { "org.elasticsearch.bootstrap." 
})); // do some basic tests selfTest(); diff --git a/core/src/main/resources/org/elasticsearch/bootstrap/security.policy b/core/src/main/resources/org/elasticsearch/bootstrap/security.policy index dcf37d3dd5..a4a45585b5 100644 --- a/core/src/main/resources/org/elasticsearch/bootstrap/security.policy +++ b/core/src/main/resources/org/elasticsearch/bootstrap/security.policy @@ -24,7 +24,7 @@ //// SecurityManager impl: //// Must have all permissions to properly perform access checks -grant codeBase "${codebase.securesm-1.0.jar}" { +grant codeBase "${codebase.securesm-1.1.jar}" { permission java.security.AllPermission; }; diff --git a/core/src/test/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandlerTests.java b/core/src/test/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandlerTests.java new file mode 100644 index 0000000000..e4ff83e9b4 --- /dev/null +++ b/core/src/test/java/org/elasticsearch/bootstrap/ElasticsearchUncaughtExceptionHandlerTests.java @@ -0,0 +1,152 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.bootstrap; + +import org.apache.lucene.index.MergePolicy; +import org.elasticsearch.test.ESTestCase; +import org.junit.Before; + +import java.io.IOError; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; + +import static org.hamcrest.CoreMatchers.equalTo; + +public class ElasticsearchUncaughtExceptionHandlerTests extends ESTestCase { + + private Map<Class<? extends Error>, Integer> expectedStatus; + + @Before + public void setUp() throws Exception { + super.setUp(); + Map<Class<? extends Error>, Integer> expectedStatus = new HashMap<>(); + expectedStatus.put(InternalError.class, 128); + expectedStatus.put(OutOfMemoryError.class, 127); + expectedStatus.put(StackOverflowError.class, 126); + expectedStatus.put(UnknownError.class, 125); + expectedStatus.put(IOError.class, 124); + this.expectedStatus = Collections.unmodifiableMap(expectedStatus); + } + + public void testUncaughtError() throws InterruptedException { + final Error error = randomFrom( + new InternalError(), + new OutOfMemoryError(), + new StackOverflowError(), + new UnknownError(), + new IOError(new IOException("fatal")), + new Error() {}); + final Thread thread = new Thread(() -> { throw error; }); + final String name = randomAsciiOfLength(10); + thread.setName(name); + final AtomicBoolean halt = new AtomicBoolean(); + final AtomicInteger observedStatus = new AtomicInteger(); + final AtomicReference<String> threadNameReference = new AtomicReference<>(); + final AtomicReference<Throwable> throwableReference = new AtomicReference<>(); + thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtError") { + + @Override + void halt(int status) { + halt.set(true); + 
observedStatus.set(status); + } + + @Override + void onFatalUncaught(String threadName, Throwable t) { + threadNameReference.set(threadName); + throwableReference.set(t); + } + + @Override + void onNonFatalUncaught(String threadName, Throwable t) { + fail(); + } + + }); + thread.start(); + thread.join(); + assertTrue(halt.get()); + final int status; + if (expectedStatus.containsKey(error.getClass())) { + status = expectedStatus.get(error.getClass()); + } else { + status = 1; + } + assertThat(observedStatus.get(), equalTo(status)); + assertThat(threadNameReference.get(), equalTo(name)); + assertThat(throwableReference.get(), equalTo(error)); + } + + public void testUncaughtException() throws InterruptedException { + final RuntimeException e = new RuntimeException("boom"); + final Thread thread = new Thread(() -> { throw e; }); + final String name = randomAsciiOfLength(10); + thread.setName(name); + final AtomicReference<String> threadNameReference = new AtomicReference<>(); + final AtomicReference<Throwable> throwableReference = new AtomicReference<>(); + thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtException") { + @Override + void halt(int status) { + fail(); + } + + @Override + void onFatalUncaught(String threadName, Throwable t) { + fail(); + } + + @Override + void onNonFatalUncaught(String threadName, Throwable t) { + threadNameReference.set(threadName); + throwableReference.set(t); + } + }); + thread.start(); + thread.join(); + assertThat(threadNameReference.get(), equalTo(name)); + assertThat(throwableReference.get(), equalTo(e)); + } + + public void testIsFatalCause() { + assertFatal(new MergePolicy.MergeException(new OutOfMemoryError(), null)); + assertFatal(new OutOfMemoryError()); + assertFatal(new StackOverflowError()); + assertFatal(new InternalError()); + assertFatal(new UnknownError()); + assertFatal(new IOError(new IOException())); + assertNonFatal(new RuntimeException()); + assertNonFatal(new 
UncheckedIOException(new IOException())); + } + + private void assertFatal(Throwable cause) { + assertTrue(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause)); + } + + private void assertNonFatal(Throwable cause) { + assertFalse(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause)); + } + +} diff --git a/distribution/licenses/securesm-1.0.jar.sha1 b/distribution/licenses/securesm-1.0.jar.sha1 deleted file mode 100644 index 96d45d93e6..0000000000 --- a/distribution/licenses/securesm-1.0.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -c0c6cf986ba0057390bfcc80c366a0e3157f944b diff --git a/distribution/licenses/securesm-1.1.jar.sha1 b/distribution/licenses/securesm-1.1.jar.sha1 new file mode 100644 index 0000000000..9144a082b6 --- /dev/null +++ b/distribution/licenses/securesm-1.1.jar.sha1 @@ -0,0 +1 @@ +1e423447d020041534be94c0f31a49fbdc1f2950
\ No newline at end of file diff --git a/docs/reference/migration/migrate_5_0/packaging.asciidoc b/docs/reference/migration/migrate_5_0/packaging.asciidoc index 977e20a76b..74faf3bb7d 100644 --- a/docs/reference/migration/migrate_5_0/packaging.asciidoc +++ b/docs/reference/migration/migrate_5_0/packaging.asciidoc @@ -55,3 +55,11 @@ from Elasticsearch. Additionally, it was previously possible to set any setting in Elasticsearch via JVM system properties. This has been removed from Elasticsearch. + +==== Dying on fatal errors + +Previous versions of Elasticsearch would not halt the JVM if out of memory errors or other fatal +errors were encountered during the life of the Elasticsearch instance. Because such errors leave +the JVM in a questionable state, the best course of action is to halt the JVM when this occurs. +Starting in Elasticsearch 5.x, this is now the case. Operators should consider configuring their +Elasticsearch services so that they respawn automatically in the case of such a fatal crash. diff --git a/docs/reference/setup.asciidoc b/docs/reference/setup.asciidoc index a883b0cc54..ae3c5b3beb 100644 --- a/docs/reference/setup.asciidoc +++ b/docs/reference/setup.asciidoc @@ -47,3 +47,5 @@ include::setup/bootstrap-checks.asciidoc[] include::setup/sysconfig.asciidoc[] include::setup/upgrade.asciidoc[] + +include::setup/stopping.asciidoc[] diff --git a/docs/reference/setup/stopping.asciidoc b/docs/reference/setup/stopping.asciidoc new file mode 100644 index 0000000000..1aa281f050 --- /dev/null +++ b/docs/reference/setup/stopping.asciidoc @@ -0,0 +1,58 @@ +[[stopping-elasticsearch]] +=== Stopping Elasticsearch + +An orderly shutdown of Elasticsearch ensures that Elasticsearch has a chance to clean up and close +outstanding resources. For example, a node that is shut down in an orderly fashion will remove itself +from the cluster, sync translogs to disk, and perform other related cleanup activities.
You can help +ensure an orderly shutdown by properly stopping Elasticsearch. + +If you're running Elasticsearch as a service, you can stop Elasticsearch via the service management +functionality provided by your installation. + +If you're running Elasticsearch directly, you can stop Elasticsearch by sending control-C if you're +running Elasticsearch in the console, or by sending `SIGTERM` to the Elasticsearch process on a +POSIX system. You can obtain the PID to send the signal to via various tools (e.g., `ps` or `jps`): + +[source,sh] +-------------------------------------------------- +$ jps | grep Elasticsearch +14542 Elasticsearch +-------------------------------------------------- + +From the Elasticsearch startup logs: + +[source,sh] +-------------------------------------------------- +[2016-07-07 12:26:18,908][INFO ][node ] [Reaper] version[5.0.0-alpha4], pid[15399], build[3f5b994/2016-06-27T16:23:46.861Z], OS[Mac OS X/10.11.5/x86_64], JVM[Oracle Corporation/Java HotSpot(TM) 64-Bit Server VM/1.8.0_92/25.92-b14] +-------------------------------------------------- + +Or by specifying a location to write a PID file to on startup (`-p <path>`): + +[source,sh] +-------------------------------------------------- +$ ./bin/elasticsearch -p /tmp/elasticsearch-pid -d +$ cat /tmp/elasticsearch-pid && echo +15516 +$ kill -SIGTERM 15516 +-------------------------------------------------- + +[[fatal-errors]] +[float] +=== Stopping on Fatal Errors + +During the life of the Elasticsearch virtual machine, certain fatal errors could arise that put the +virtual machine in a questionable state. Such fatal errors include out of memory errors, internal +errors in the virtual machine, and serious I/O errors. + +When Elasticsearch detects that the virtual machine has encountered such a fatal error, Elasticsearch +will attempt to log the error and then will halt the virtual machine.
When Elasticsearch initiates +such a shutdown, it does not go through an orderly shutdown as described above. The Elasticsearch +process will also return with a special status code indicating the nature of the error. + +[horizontal] +JVM internal error:: 128 +Out of memory error:: 127 +Stack overflow error:: 126 +Unknown virtual machine error:: 125 +Serious I/O error:: 124 +Unknown fatal error:: 1 diff --git a/test/framework/src/main/java/org/elasticsearch/bootstrap/BootstrapForTesting.java b/test/framework/src/main/java/org/elasticsearch/bootstrap/BootstrapForTesting.java index 5b79721948..fe624297e7 100644 --- a/test/framework/src/main/java/org/elasticsearch/bootstrap/BootstrapForTesting.java +++ b/test/framework/src/main/java/org/elasticsearch/bootstrap/BootstrapForTesting.java @@ -150,7 +150,7 @@ public class BootstrapForTesting { return esPolicy.implies(domain, permission) || testFramework.implies(domain, permission); } }); - System.setSecurityManager(new SecureSM(true)); + System.setSecurityManager(SecureSM.createTestSecureSM()); Security.selfTest(); // guarantee plugin classes are initialized first, in case they have one-time hacks. |