【08】把 Elasticsearch 当数据库使:计算后再聚合

835 查看

使用 https://github.com/taowen/es-monitor 可以用 SQL 进行 elasticsearch 的查询。前面我们了解到在聚合之前可以做很多种变换,然后把变换之后的key再去分桶。这种变换的终极形式就是自定义表达式,当然自定义表达式的效率也是最低的。

GROUP BY ipo_year % AS ipo_year_rem

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
SELECT ipo_year_rem, COUNT(*) FROM symbol GROUP BY ipo_year % 5 AS ipo_year_rem  
EOF
{"COUNT(*)": 715, "ipo_year_rem": 4.0}
{"COUNT(*)": 677, "ipo_year_rem": 0.0}
{"COUNT(*)": 537, "ipo_year_rem": 2.0}
{"COUNT(*)": 523, "ipo_year_rem": 3.0}
{"COUNT(*)": 446, "ipo_year_rem": 1.0}

Elasticsearch

{
  "aggs": {
    "ipo_year_rem": {
      "terms": {
        "field": "ipo_year", 
        "size": 0, 
        "script": {
          "lang": "expression", 
          "inline": "_value % 5"
        }
      }, 
      "aggs": {}
    }
  }, 
  "size": 0
}
{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 2, 
  "aggregations": {
    "ipo_year_rem": {
      "buckets": [
        {
          "key": 4.0, 
          "doc_count": 715
        }, 
        {
          "key": 0.0, 
          "doc_count": 677
        }, 
        {
          "key": 2.0, 
          "doc_count": 537
        }, 
        {
          "key": 3.0, 
          "doc_count": 523
        }, 
        {
          "key": 1.0, 
          "doc_count": 446
        }
      ], 
      "sum_other_doc_count": 0, 
      "doc_count_error_upper_bound": 0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "1.046876000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 16550,
          "next_doc": 835828,
          "match": 0,
          "build_scorer": 194498,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 8693,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "2.994827000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.3955230000ms"
          },
          {
            "name": "DoubleTermsAggregator: [ipo_year_rem]",
            "reason": "aggregation",
            "time": "0.9664500000ms"
          }
        ]
      }
    ]
  }
]

GROUP BY floor(market_cap / last_sale)

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
SELECT shares_count, COUNT(*) FROM symbol GROUP BY floor(market_cap / last_sale / 1000000)  AS shares_count ORDER BY shares_count LIMIT 3
EOF
{"shares_count": "0.0", "COUNT(*)": 6007}
{"shares_count": "1.0", "COUNT(*)": 328}
{"shares_count": "10.0", "COUNT(*)": 6}

Elasticsearch

{
  "aggs": {
    "shares_count": {
      "terms": {
        "size": 3, 
        "order": {
          "_term": "asc"
        }, 
        "script": {
          "lang": "expression", 
          "inline": "floor(doc['market_cap'].value / doc['last_sale'].value / 1000000)"
        }
      }, 
      "aggs": {}
    }
  }, 
  "size": 0
}

如果引用了多个字段,则无法使用_value,而只能用doc['market_cap'].value 这样来引用字段。

{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 6, 
  "aggregations": {
    "shares_count": {
      "buckets": [
        {
          "key": "0.0", 
          "doc_count": 6007
        }, 
        {
          "key": "1.0", 
          "doc_count": 328
        }, 
        {
          "key": "10.0", 
          "doc_count": 6
        }
      ], 
      "sum_other_doc_count": 373, 
      "doc_count_error_upper_bound": 0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.5422700000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 14630,
          "next_doc": 475299,
          "match": 0,
          "build_scorer": 52341,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 5085,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "7.627612000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.4945110000ms"
          },
          {
            "name": "StringTermsAggregator: [shares_count]",
            "reason": "aggregation",
            "time": "5.193886000ms"
          }
        ]
      }
    ]
  }
]

可以看到这里使用的是 StringTermsAggregator,这说明了计算结果是用字符串来排序的。