diff options
author | Gautam Parai <gparai@maprtech.com> | 2014-08-21 14:59:53 -0700 |
---|---|---|
committer | Gautam Parai <gparai@maprtech.com> | 2019-02-28 12:01:24 -0800 |
commit | 469be17597e7b7c6bc1de9863dcb6c5604a55f0c (patch) | |
tree | 76a1c2572cfb19a75a0f82e6d165db333797fe3b /contrib/native/client/src | |
parent | 3233d8aaff57ac71bd3b726efcd5fdaa92aef861 (diff) |
DRILL-1328: Support table statistics - Part 2
Add support for avg row-width and major type statistics.
Parallelize the ANALYZE implementation and stats UDF implementation to improve stats collection performance.
Update/fix rowcount, selectivity and ndv computations to improve plan costing.
Add options for configuring collection/usage of statistics.
Add new APIs and implementation for stats writer (as a precursor to Drill Metastore APIs).
Fix several stats/costing related issues identified while running TPC-H and TPC-DS queries.
Add support for CPU sampling and nested scalar columns.
Add more testcases for collection and usage of statistics and fix remaining unit/functional test failures.
Thanks to Venki Korukanti (@vkorukanti) for the description below (modified to account for new changes). He graciously agreed to rebase the patch onto the latest master, fixed a few issues and added a few tests.
FUNCS: Statistics functions as UDFs:
Separate
Currently using FieldReader to ensure consistent output type so that Unpivot doesn't get confused. All stats columns should be Nullable, so that stats functions can return NULL when N/A.
* custom versions of "count" that always return BigInt
* HyperLogLog based NDV that returns BigInt that works only on VarChars
* HyperLogLog with binary output that only works on VarChars
OPS: Updated protobufs for new ops
OPS: Implemented StatisticsMerge
OPS: Implemented StatisticsUnpivot
ANALYZE: AnalyzeTable functionality
* JavaCC syntax more-or-less copied from LucidDB.
* (Basic) AnalyzePrule: DrillAnalyzeRel -> UnpivotPrel StatsMergePrel FilterPrel(for sampling) StatsAggPrel ScanPrel
ANALYZE: Add getMetadataTable() to AbstractSchema
USAGE: Change field access in QueryWrapper
USAGE: Add getDrillTable() to DrillScanRelBase and ScanPrel
* since ScanPrel does not inherit from DrillScanRelBase, this requires adding a DrillTable to the constructor
* This is done so that a custom ReflectiveRelMetadataProvider can access the DrillTable associated with Logical/Physical scans.
USAGE: Attach DrillStatsTable to DrillTable.
* DrillStatsTable represents the data scanned from a corresponding ".stats.drill" table
* In order to avoid doing query execution right after the ".stats.drill" table is found, metadata is not actually collected until the MaterializationVisitor is used.
** Currently, the metadata source must be a string (so that a SQL query can be created). Doing this with a table is probably more complicated.
** Query is set up to extract only the most recent statistics results for each column.
closes #729
Diffstat (limited to 'contrib/native/client/src')
-rw-r--r-- | contrib/native/client/src/protobuf/UserBitShared.pb.cc | 15 | ||||
-rw-r--r-- | contrib/native/client/src/protobuf/UserBitShared.pb.h | 7 |
2 files changed, 15 insertions, 7 deletions
diff --git a/contrib/native/client/src/protobuf/UserBitShared.pb.cc b/contrib/native/client/src/protobuf/UserBitShared.pb.cc index 1e0712091..0db64d348 100644 --- a/contrib/native/client/src/protobuf/UserBitShared.pb.cc +++ b/contrib/native/client/src/protobuf/UserBitShared.pb.cc @@ -754,7 +754,7 @@ void protobuf_AddDesc_UserBitShared_2eproto() { "entState\022\013\n\007SENDING\020\000\022\027\n\023AWAITING_ALLOCA" "TION\020\001\022\013\n\007RUNNING\020\002\022\014\n\010FINISHED\020\003\022\r\n\tCAN" "CELLED\020\004\022\n\n\006FAILED\020\005\022\032\n\026CANCELLATION_REQ" - "UESTED\020\006*\247\t\n\020CoreOperatorType\022\021\n\rSINGLE_" + "UESTED\020\006*\351\t\n\020CoreOperatorType\022\021\n\rSINGLE_" "SENDER\020\000\022\024\n\020BROADCAST_SENDER\020\001\022\n\n\006FILTER" "\020\002\022\022\n\016HASH_AGGREGATE\020\003\022\r\n\tHASH_JOIN\020\004\022\016\n" "\nMERGE_JOIN\020\005\022\031\n\025HASH_PARTITION_SENDER\020\006" @@ -784,10 +784,12 @@ void protobuf_AddDesc_UserBitShared_2eproto() { "\025\n\021SEQUENCE_SUB_SCAN\0205\022\023\n\017PARTITION_LIMI" "T\0206\022\023\n\017PCAPNG_SUB_SCAN\0207\022\022\n\016RUNTIME_FILT" "ER\0208\022\017\n\013ROWKEY_JOIN\0209\022\023\n\017SYSLOG_SUB_SCAN" - "\020:*g\n\nSaslStatus\022\020\n\014SASL_UNKNOWN\020\000\022\016\n\nSA" - "SL_START\020\001\022\024\n\020SASL_IN_PROGRESS\020\002\022\020\n\014SASL" - "_SUCCESS\020\003\022\017\n\013SASL_FAILED\020\004B.\n\033org.apach" - "e.drill.exec.protoB\rUserBitSharedH\001", 5555); + "\020:\022\030\n\024STATISTICS_AGGREGATE\020;\022\020\n\014UNPIVOT_" + "MAPS\020<\022\024\n\020STATISTICS_MERGE\020=*g\n\nSaslStat" + "us\022\020\n\014SASL_UNKNOWN\020\000\022\016\n\nSASL_START\020\001\022\024\n\020" + "SASL_IN_PROGRESS\020\002\022\020\n\014SASL_SUCCESS\020\003\022\017\n\013" + "SASL_FAILED\020\004B.\n\033org.apache.drill.exec.p" + "rotoB\rUserBitSharedH\001", 5621); ::google::protobuf::MessageFactory::InternalRegisterGeneratedFile( "UserBitShared.proto", &protobuf_RegisterTypes); 
UserCredentials::default_instance_ = new UserCredentials(); @@ -967,6 +969,9 @@ bool CoreOperatorType_IsValid(int value) { case 56: case 57: case 58: + case 59: + case 60: + case 61: return true; default: return false; diff --git a/contrib/native/client/src/protobuf/UserBitShared.pb.h b/contrib/native/client/src/protobuf/UserBitShared.pb.h index b95b311c8..a8e6ccba6 100644 --- a/contrib/native/client/src/protobuf/UserBitShared.pb.h +++ b/contrib/native/client/src/protobuf/UserBitShared.pb.h @@ -262,11 +262,14 @@ enum CoreOperatorType { PCAPNG_SUB_SCAN = 55, RUNTIME_FILTER = 56, ROWKEY_JOIN = 57, - SYSLOG_SUB_SCAN = 58 + SYSLOG_SUB_SCAN = 58, + STATISTICS_AGGREGATE = 59, + UNPIVOT_MAPS = 60, + STATISTICS_MERGE = 61 }; bool CoreOperatorType_IsValid(int value); const CoreOperatorType CoreOperatorType_MIN = SINGLE_SENDER; -const CoreOperatorType CoreOperatorType_MAX = SYSLOG_SUB_SCAN; +const CoreOperatorType CoreOperatorType_MAX = STATISTICS_MERGE; const int CoreOperatorType_ARRAYSIZE = CoreOperatorType_MAX + 1; const ::google::protobuf::EnumDescriptor* CoreOperatorType_descriptor(); |