【10】把 Elasticsearch 当数据库使:Drill Down 下钻

828 查看

使用 https://github.com/taowen/es-monitor 可以用 SQL 进行 elasticsearch 的查询。传统的 SQL 的语义是每一层查询是对下一层查询的综合,也就是每SELECT一层行数就会变少。比如

SELECT sum(per_sector) AS total FROM (
    SELECT sector, count(*) AS per_sector FROM (
        SELECT sector, ipo_year FROM symbol
    )
)

Elasticsearch是不支持这样的嵌套SELECT的。它支持一个更实用的功能,嵌套下钻(Drill Down)。与传统SQL的语义正好相反,Elasticsearch的嵌套不是SELECT FROM 而是 SELECT INSIDE,每经过一层SELECT其行数不减少反而增加。所以我把Elasticsearch的这种嵌套查询不称之为SELECT FROM而是SELECT INSIDE以示区别,当然为了习惯,用SELECT FROM也是支持的。

SELECT INSIDE all_symbols GROUP BY ipo_year

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
    WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol);
    WITH per_ipo_year AS (SELECT ipo_year, MAX(market_cap) AS max_this_year INSIDE all_symbols 
        GROUP BY ipo_year LIMIT 2)
EOF

第一行SELECT定义了all_symbols,第二行SELECT是在第一次查询的基础之上进行自己的二次聚合。这样多层SELECT的好处是每一层可以定义自己的SELECT字段。比如GROUP BY a,b,c其实做了三次下钻,但是限于SQL的语法这样写是没法给没一层下钻的分桶计算指标,但是用上面这种WITH的语法,就可以每钻一层计算一层的指标。

{"_bucket_path": ["per_ipo_year"], "max_this_year": 54171930444.0, "max_all_times": 522690000000.0, "ipo_year": 2014}
{"_bucket_path": ["per_ipo_year"], "max_this_year": 5416144671.0, "max_all_times": 522690000000.0, "ipo_year": 2015}

Elasticsearch

{
  "aggs": {
    "max_all_times": {
      "max": {
        "field": "market_cap"
      }
    }, 
    "ipo_year": {
      "terms": {
        "field": "ipo_year", 
        "size": 2
      }, 
      "aggs": {
        "max_this_year": {
          "max": {
            "field": "market_cap"
          }
        }
      }
    }
  }, 
  "size": 0
}

下钻的概念直接看Elasticsearch的查询其实更清楚。每下钻一层,括号就往右深了一层。从某种意义上来说,Elasticsearch的DSL其实更接近聚合的实质。

{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 2, 
  "aggregations": {
    "max_all_times": {
      "value": 522690000000.0
    }, 
    "ipo_year": {
      "buckets": [
        {
          "max_this_year": {
            "value": 54171930444.0
          }, 
          "key": 2014, 
          "doc_count": 390
        }, 
        {
          "max_this_year": {
            "value": 5416144671.0
          }, 
          "key": 2015, 
          "doc_count": 334
        }
      ], 
      "sum_other_doc_count": 2174, 
      "doc_count_error_upper_bound": 0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.2003040000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 9025,
          "next_doc": 162380,
          "match": 0,
          "build_scorer": 28899,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 2523,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "2.325354000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.2740410000ms"
          },
          {
            "name": "BucketCollector: [[max_all_times, ipo_year]]",
            "reason": "aggregation",
            "time": "1.295439000ms"
          }
        ]
      }
    ]
  }
]

SELECT INSIDE 多次

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
    WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol);
    WITH per_ipo_year AS (SELECT ipo_year, MAX(market_cap) AS max_this_year INSIDE all_symbols 
        GROUP BY ipo_year LIMIT 2);
    WITH per_sector AS (SELECT sector, MAX(market_cap) AS max_this_sector INSIDE per_ipo_year 
        GROUP BY sector LIMIT 2)
EOF

这个和GROUP BY ipo_year, sector其实是差不多的,区别在于对每一层下钻都可以选取这一层的指标出来。

{"sector": "Health Care", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 54171930444.0, "ipo_year": 2014, "max_all_times": 522690000000.0, "max_this_sector": 2660000000.0}
{"sector": "Finance", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 54171930444.0, "ipo_year": 2014, "max_all_times": 522690000000.0, "max_this_sector": 5530000000.0}
{"sector": "Finance", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 5416144671.0, "ipo_year": 2015, "max_all_times": 522690000000.0, "max_this_sector": 2740000000.0}
{"sector": "Health Care", "_bucket_path": ["per_ipo_year", "per_sector"], "max_this_year": 5416144671.0, "ipo_year": 2015, "max_all_times": 522690000000.0, "max_this_sector": 5416144671.0}

Elasticsearch

{
  "aggs": {
    "max_all_times": {
      "max": {
        "field": "market_cap"
      }
    }, 
    "ipo_year": {
      "terms": {
        "field": "ipo_year", 
        "size": 2
      }, 
      "aggs": {
        "sector": {
          "terms": {
            "field": "sector", 
            "size": 2
          }, 
          "aggs": {
            "max_this_sector": {
              "max": {
                "field": "market_cap"
              }
            }
          }
        }, 
        "max_this_year": {
          "max": {
            "field": "market_cap"
          }
        }
      }
    }
  }, 
  "size": 0
}
{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 8, 
  "aggregations": {
    "max_all_times": {
      "value": 522690000000.0
    }, 
    "ipo_year": {
      "buckets": [
        {
          "sector": {
            "buckets": [
              {
                "max_this_sector": {
                  "value": 2660000000.0
                }, 
                "key": "Health Care", 
                "doc_count": 104
              }, 
              {
                "max_this_sector": {
                  "value": 5530000000.0
                }, 
                "key": "Finance", 
                "doc_count": 70
              }
            ], 
            "sum_other_doc_count": 216, 
            "doc_count_error_upper_bound": 0
          }, 
          "max_this_year": {
            "value": 54171930444.0
          }, 
          "key": 2014, 
          "doc_count": 390
        }, 
        {
          "sector": {
            "buckets": [
              {
                "max_this_sector": {
                  "value": 2740000000.0
                }, 
                "key": "Finance", 
                "doc_count": 92
              }, 
              {
                "max_this_sector": {
                  "value": 5416144671.0
                }, 
                "key": "Health Care", 
                "doc_count": 92
              }
            ], 
            "sum_other_doc_count": 150, 
            "doc_count_error_upper_bound": 0
          }, 
          "max_this_year": {
            "value": 5416144671.0
          }, 
          "key": 2015, 
          "doc_count": 334
        }
      ], 
      "sum_other_doc_count": 2174, 
      "doc_count_error_upper_bound": 0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.2576120000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 63193,
          "next_doc": 165400,
          "match": 0,
          "build_scorer": 29019,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 3205,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "6.292688000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.2599140000ms"
          },
          {
            "name": "BucketCollector: [[max_all_times, ipo_year]]",
            "reason": "aggregation",
            "time": "5.172211000ms"
          }
        ]
      }
    ]
  }
]

相信看过前面例子,你应该理解了什么叫下钻了。每下钻一层,前面的一行就会被再次分裂到多个桶里。每一层都可以搞自己的指标计算。