Steps to troubleshoot CPU skew
search cancel

Steps to troubleshoot CPU skew

book

Article ID: 406378

calendar_today

Updated On:

Products

VMware Tanzu Greenplum

Issue/Introduction

How to identify CPU skewed queries (queries that are responsible for processing skew) and their associated skewed segment(s) 

Resolution

The following User Defined Function (gpmetrics.gpcc_queries_top_10_cpu_skew(INTERVAL)) can be used to enhance the flexibility of accessing top list of skewed queries — it allows you to specify a time interval when analyzing query CPU skew across segments.

Additional Information

-- Get the top 10 queries utilizing most CPU skew among segments, during the past 1 hour/1 day/1 week/1 month.
-- Finds the queries with top 10 CPU skew among segments in the past hour from now() by default.
DROP FUNCTION IF EXISTS gpmetrics.gpcc_queries_top_10_cpu_skew(INTERVAL);
CREATE OR REPLACE FUNCTION gpmetrics.gpcc_queries_top_10_cpu_skew(
    IN interval_time INTERVAL DEFAULT'1h'::interval,
    OUT query_time TIMESTAMP,
    OUT query_id TEXT,
    OUT user_name VARCHAR,
    OUT database VARCHAR,
    OUT max_cpu integer,
    OUT avg_cpu numeric,
    OUT cpu_skew text,
    OUT max_cpu_segid integer,
    OUT query TEXT)
    RETURNS SETOF RECORD AS
$$
BEGIN
RETURN QUERY
    WITH base AS (
        SELECT
            q.ctime,
            q.tmid,
            q.ssid,
            q.ccnt,
            q.username,
            q.db,
            p.segid,
            s.key::int AS sliceid,
            (s.value->'cpu')::text::int AS cpu,
            substr(q.query_text, 0, 128) AS query_text
        FROM gpmetrics.gpcc_queries_history q
        JOIN json_each(q.slices_metrics) s ON true
        JOIN gpmetrics.gpcc_plannode_history p
          ON (q.tmid, q.ssid, q.ccnt) = (p.tmid, p.ssid, p.ccnt)
         AND s.key::int = p.sliceid
        WHERE q.ctime >= now() - interval_time - interval '2 day'
          AND q.ctime < now() + interval '2 day'
          AND q.tsubmit >= now() - interval_time
    ),
    ranked AS (
        SELECT *,
               MAX(cpu) OVER (PARTITION BY ctime, tmid, ssid, ccnt) AS cpu_max,
               AVG(cpu) OVER (PARTITION BY ctime, tmid, ssid, ccnt) AS cpu_avg,
               ROW_NUMBER() OVER (PARTITION BY ctime, tmid, ssid, ccnt ORDER BY cpu DESC) AS cpu_rank
        FROM base
    ),
    filtered AS (
        SELECT *
        FROM ranked
        WHERE cpu_rank = 1
    )
SELECT
    ctime,
    tmid || '-' || ssid || '-' || ccnt AS query_id,
    username,
    db,
    cpu_max,
    ROUND(cpu_avg, 2) AS cpu_avg,
    ROUND(100.0 - 100.0 * cpu_avg / cpu_max, 2)::text || '%' AS cpu_skew,
        segid AS max_cpu_segid,
    query_text
FROM filtered
WHERE cpu_max > 0
ORDER BY cpu_skew DESC
    LIMIT 10;
END;
$$ LANGUAGE plpgsql;
REVOKE ALL ON FUNCTION gpmetrics.gpcc_queries_top_10_cpu_skew(interval) FROM PUBLIC;
GRANT EXECUTE ON FUNCTION gpmetrics.gpcc_queries_top_10_cpu_skew(interval) TO PUBLIC;

 

> select * from gpmetrics.gpcc_queries_top_10_cpu_skew('1 month');

 

     query_time      |     query_id     | user_name | database | max_cpu | avg_cpu | cpu_skew | max_cpu_segid |         query
---------------------+------------------+-----------+----------+---------+---------+----------+---------------+-----------------------
2025-07-09 17:36:04 | 1752052123-797-3 | gpmon     | postgres |      30 |   20.00 | 33.33%   |             0 | select xxx;
2025-07-09 17:37:44 | 1752052123-842-3 | gpmon     | postgres |      20 |   15.00 | 25.00%   |             0 | select yyy;
(2 rows)