From c297bae697c1b61a6a66712f42b0d67e1bdfd90c Mon Sep 17 00:00:00 2001 From: towards-a-new-leftypol Date: Thu, 26 Oct 2023 18:19:32 -0400 Subject: [PATCH] - Change fetch_catalog and fetch_top_threads functions - neither was correct - fetch_top_threads doesn't need to be a loop, it will be up to the application whether or not to load more results. This simplifies the function, which was sometimes returning more than one row for each thread_id anyway - fetch_catalog is now fairly performant, and fetches all relevant information --- sql/archive_tests.sql | 200 ++++++++++++++++++++++++++++++++++++-- sql/archive_tests2.sql | 216 +++++++++++++++++++++++++++++++++++++++++ sql/initialize.sql | 89 +++++++++++++++++ 3 files changed, 496 insertions(+), 9 deletions(-) create mode 100644 sql/archive_tests2.sql diff --git a/sql/archive_tests.sql b/sql/archive_tests.sql index 7b9dce8..83a451b 100644 --- a/sql/archive_tests.sql +++ b/sql/archive_tests.sql @@ -28,7 +28,7 @@ LIMIT 100; - SELECT +SELECT thread_data.thread_id, thread_data.bump_time, thread_data.first_post_subject, @@ -313,6 +313,16 @@ group by thread_id order by bump_time desc; +select max(creation_time) as bump_time, count(*), thread_id, min(creation_time) as where_to_leave_off +from + ( + select thread_id, creation_time, body from posts + where creation_time < NOW() - interval '365 day' + order by creation_time desc limit 10000 + ) as t +group by thread_id +order by bump_time desc; + explain analyze SELECT @@ -330,8 +340,8 @@ LIMIT 100 -- Create temp table if not exists -drop table if exists temp_results; -CREATE TEMP TABLE IF NOT EXISTS temp_results (bump_time TIMESTAMPTZ, post_count INT, thread_id INT, last_fetched_time TIMESTAMP); +DROP TABLE IF EXISTS temp_results; + /* * This function scans backwards from p_start_time until we get the desired number of threads. @@ -351,7 +361,8 @@ DECLARE max_iterations INT := 10; -- Maximum iterations to avoid endless loop in case of errors result_count INT := 0; last_min_time TIMESTAMP; -BEGIN +BEGIN + CREATE TEMP TABLE IF NOT EXISTS temp_results (bump_time TIMESTAMPTZ, post_count INT, thread_id INT, last_fetched_time TIMESTAMP); TRUNCATE temp_results; -- clear the table FOR i IN 1..max_iterations LOOP @@ -368,9 +379,7 @@ BEGIN GROUP BY t.thread_id; -- Check if we have enough threads - --SELECT temp_results.thread_id INTO result_count FROM temp_results; - SELECT COUNT(DISTINCT temp_results.thread_id) INTO result_count FROM temp_results; -- specify the table name here to avoid ambiguity - + SELECT COUNT(DISTINCT temp_results.thread_id) INTO result_count FROM temp_results; IF result_count >= p_desired_threads THEN EXIT; @@ -387,13 +396,14 @@ BEGIN -- Return the results RETURN QUERY SELECT temp_results.bump_time, temp_results.post_count, temp_results.thread_id FROM temp_results ORDER BY temp_results.bump_time DESC; + DROP TABLE IF EXISTS temp_results; END; $$; -select * FROM fetch_top_threads(NOW(), 300) offset 0; +select * FROM fetch_top_threads(NOW(), 100); -explain analyze WITH TopThreads AS ( +WITH TopThreads AS ( SELECT * FROM fetch_top_threads(NOW() - interval '365d', 100) ) SELECT @@ -409,3 +419,175 @@ JOIN posts p ON tt.thread_id = p.thread_id ORDER BY p.creation_time asc; + + +SELECT * +FROM + fetch_top_threads(NOW() - interval '365d', 100) top +JOIN + posts ON posts.thread_id = top.thread_id +ORDER BY posts.creation_time ASC; + + +SELECT + post_id, + board_post_id, + posts.thread_id, + bump_time, + creation_time, + body +FROM + fetch_top_threads(NOW() - interval '365d', 100) top +JOIN + posts ON posts.thread_id = top.thread_id +ORDER BY posts.creation_time ASC; + + +VACUUM ANALYZE posts; + +SELECT posts.post_id, posts.thread_id, posts.body +FROM + fetch_top_threads(NOW(), 100) top +JOIN + posts ON posts.thread_id = top.thread_id; + + +WITH +top AS ( + SELECT * FROM fetch_top_threads(NOW(), 100) top +), +joined AS ( + SELECT posts.post_id, posts.thread_id, posts.body + FROM top JOIN posts ON posts.thread_id = top.thread_id + ORDER BY posts.creation_time DESC +), +grouped AS ( + SELECT min(joined.post_id) op_post_id, joined.thread_id FROM joined + GROUP BY joined.thread_id +) SELECT * FROM grouped JOIN joined ON op_post_id = joined.post_id; + + +WITH +top AS ( + SELECT * FROM fetch_top_threads(NOW(), 100) OFFSET 0 +), +joined AS ( + SELECT posts.post_id, posts.thread_id, posts.body, posts.creation_time + FROM top JOIN posts ON posts.thread_id = top.thread_id + ORDER BY posts.creation_time DESC +), +grouped AS ( + SELECT min(joined.creation_time) op_creation_time, joined.thread_id, count(*) AS post_count FROM joined + GROUP BY joined.thread_id +) SELECT joined.*, grouped.post_count +FROM grouped JOIN joined ON op_creation_time = joined.creation_time AND grouped.thread_id = joined.thread_id; + + +WITH +top AS ( + SELECT * FROM fetch_top_threads(NOW(), 200) OFFSET 0 +), +joined AS ( + SELECT + posts.post_id, + posts.thread_id, + posts.body, + posts.creation_time, + ROW_NUMBER() OVER(PARTITION BY posts.thread_id ORDER BY posts.creation_time, posts.post_id) as rn + FROM top + JOIN posts ON posts.thread_id = top.thread_id +), +grouped AS ( + SELECT joined.post_id, joined.thread_id + FROM joined + WHERE joined.rn = 1 +) +SELECT joined.* +FROM grouped +JOIN joined ON grouped.post_id = joined.post_id; + + + +SELECT + sub.post_id, + sub.thread_id, + sub.body, + sub.creation_time +FROM +( + SELECT + posts.post_id, + posts.thread_id, + posts.body, + posts.creation_time, + ROW_NUMBER() OVER(PARTITION BY posts.thread_id ORDER BY posts.creation_time, posts.post_id) as rn + FROM fetch_top_threads(NOW(), 200) AS top + JOIN posts ON posts.thread_id = top.thread_id +) AS sub +WHERE + sub.rn = 1; + + +SELECT + sub.post_id, + sub.thread_id, + sub.body, + sub.creation_time +FROM +( + SELECT + posts.post_id, + posts.thread_id, + posts.body, + posts.creation_time, + ROW_NUMBER() OVER(PARTITION BY posts.thread_id ORDER BY posts.creation_time, posts.post_id) as rn + FROM + (SELECT * FROM fetch_top_threads(NOW(), 200) OFFSET 0) AS top + JOIN posts ON posts.thread_id = top.thread_id +) AS sub +WHERE + sub.rn = 1; + +SELECT count(*) FROM fetch_catalog(NOW(), 400, 0); + +SELECT * FROM fetch_top_threads(NOW(), 200); + + +SELECT count(*) FROM +( +SELECT top.thread_id, count(*) as post_count FROM fetch_top_threads(NOW(), 200) top JOIN posts ON posts.thread_id = top.thread_id +GROUP BY top.thread_id +) t; + +-- how do I + + +CREATE OR REPLACE FUNCTION fetch_catalog(max_time timestamptz, wanted_result_count integer, offset_count integer) +RETURNS TABLE ( + op_post_id bigint, + thread_id bigint, + post_count bigint, + body text, + creation_time timestamptz +) +LANGUAGE SQL +AS $$ +WITH +top AS ( + SELECT * FROM fetch_top_threads(max_time, wanted_result_count) OFFSET offset_count +), +joined AS ( + SELECT posts.post_id, posts.thread_id, posts.body, posts.creation_time + FROM top JOIN posts ON posts.thread_id = top.thread_id + ORDER BY posts.creation_time DESC +), +grouped AS ( + SELECT min(joined.post_id) op_post_id, joined.thread_id, count(*) AS post_count FROM joined + GROUP BY joined.thread_id +) SELECT grouped.*, joined.body, joined.creation_time FROM grouped JOIN joined ON op_post_id = joined.post_id; +$$; + + + + + diff --git a/sql/archive_tests2.sql b/sql/archive_tests2.sql new file mode 100644 index 0000000..ad66fa5 --- /dev/null +++ b/sql/archive_tests2.sql @@ -0,0 +1,216 @@ +CREATE OR REPLACE FUNCTION fetch_top_threads( + p_start_time TIMESTAMPTZ, + lookback INT DEFAULT 10000 +) +RETURNS TABLE(bump_time TIMESTAMPTZ, post_count BIGINT, thread_id BIGINT, where_to_leave_off TIMESTAMPTZ) +LANGUAGE sql +AS $$ + SELECT + max(creation_time) as bump_time, + count(*), + thread_id, + min(creation_time) as where_to_leave_off + FROM + ( + SELECT thread_id, creation_time + FROM posts + WHERE creation_time < p_start_time + ORDER BY creation_time DESC + LIMIT LEAST(lookback, 250000) -- capping the lookback to 250k + ) as t + GROUP BY thread_id + ORDER BY bump_time DESC; +$$; + + +SELECT * FROM fetch_top_threads(NOW() - INTERVAL '10d'); + +SELECT * FROM fetch_top_threads(NOW() - INTERVAL '10d') top JOIN posts ON top.thread_id = posts.thread_id; + + +SELECT + ordered_posts.thread_id, + count(*), + array_agg(ordered_posts.body) +FROM ( + SELECT top.thread_id, posts.body + FROM fetch_top_threads(NOW() - INTERVAL '10d', 2000) top + JOIN posts ON top.thread_id = posts.thread_id + ORDER BY posts.board_post_id ASC +) AS ordered_posts +GROUP BY ordered_posts.thread_id; + +WITH + top AS + ( + SELECT * FROM fetch_top_threads(NOW()) top + ), + tall_posts AS + ( + SELECT top.post_count as estimated_post_count, posts.* + FROM top + JOIN posts ON top.thread_id = posts.thread_id + ), + op_posts AS + ( + SELECT DISTINCT ON (t.thread_id) + * + FROM tall_posts t + ORDER BY t.thread_id, t.board_post_id + ) +SELECT * FROM op_posts; + + +WITH + top AS + ( + SELECT * FROM fetch_top_threads(NOW()) top + ), + tall_posts AS + ( + SELECT top.post_count as estimated_post_count, posts.* + FROM top + JOIN posts ON top.thread_id = posts.thread_id + ), + op_posts AS + ( + SELECT DISTINCT ON (t.thread_id) + * + FROM tall_posts t + ORDER BY t.thread_id, t.board_post_id + ), + post_counts AS + ( + SELECT thread_id, count(*) as post_count FROM + tall_posts + GROUP BY thread_id + ) +SELECT post_counts.post_count, op_posts.* FROM op_posts JOIN post_counts ON op_posts.thread_id = post_counts.thread_id; + + +-- 71ms! +WITH + top AS + ( + SELECT * FROM fetch_top_threads(NOW() - INTERVAL '1y', 1000) top + ), + tall_posts AS + ( + SELECT + top.post_count as estimated_post_count, + top.bump_time, + posts.post_id, + posts.board_post_id, + posts.creation_time, + posts.body, + posts.thread_id + FROM top + JOIN posts ON top.thread_id = posts.thread_id + ), + op_posts AS + ( + SELECT DISTINCT ON (t.thread_id) + * + FROM tall_posts t + ORDER BY t.thread_id, t.board_post_id + ), + post_counts AS + ( + SELECT thread_id, count(*) as post_count FROM + tall_posts + GROUP BY thread_id + ) +SELECT + post_counts.post_count, + op_posts.*, + threads.board_thread_id, + boards.pathpart, + sites."name" +FROM op_posts +JOIN post_counts ON op_posts.thread_id = post_counts.thread_id +JOIN threads ON op_posts.thread_id = threads.thread_id +JOIN boards ON threads.board_id = boards.board_id +JOIN sites ON sites.site_id = boards.site_id; + + +CREATE OR REPLACE FUNCTION fetch_catalog(max_time timestamptz, max_row_read int DEFAULT 10000) +RETURNS TABLE ( + post_count bigint, + estimated_post_count bigint, + post_id bigint, + board_post_id bigint, + creation_time timestamptz, + body text, + thread_id bigint, + board_thread_id bigint, + pathpart text, + name text +) AS $$ + WITH + top AS + ( + SELECT * FROM fetch_top_threads(max_time, max_row_read) AS top + ), + tall_posts AS + ( + SELECT + top.post_count as estimated_post_count, + posts.post_id, + posts.board_post_id, + posts.creation_time, + posts.body, + posts.thread_id + FROM top + JOIN posts ON top.thread_id = posts.thread_id + WHERE creation_time < max_time + ), + op_posts AS + ( + SELECT DISTINCT ON (t.thread_id) + * + FROM tall_posts t + ORDER BY t.thread_id, t.board_post_id + ), + post_counts AS + ( + SELECT thread_id, count(*) as post_count FROM + tall_posts + GROUP BY thread_id + ) + SELECT + post_counts.post_count, + op_posts.*, + threads.board_thread_id, + boards.pathpart, + sites."name" + FROM op_posts + JOIN post_counts ON op_posts.thread_id = post_counts.thread_id + JOIN threads ON op_posts.thread_id = threads.thread_id + JOIN boards ON threads.board_id = boards.board_id + JOIN sites ON sites.site_id = boards.site_id; +$$ LANGUAGE sql; + +SELECT * FROM fetch_catalog(NOW() - INTERVAL '1y', 1001); + + +-- CREATE INDEX idx_posts_thread_board ON posts (thread_id, board_post_id); +ANALYZE posts; + + +SELECT DISTINCT ON (top.thread_id) + top.thread_id, + posts.post_id, + posts.body, + top.post_count, + top.where_to_leave_off +FROM fetch_top_threads(NOW() - INTERVAL '10d') top +JOIN posts ON top.thread_id = posts.thread_id +ORDER BY top.thread_id, posts.board_post_id; + + +SELECT * FROM threads WHERE thread_id = 3110; +SELECT * FROM posts WHERE thread_id = 3110 ORDER BY board_post_id ASC; +SELECT * FROM boards; + +SELECT * FROM threads WHERE thread_id = 11314; +ANALYZE posts; diff --git a/sql/initialize.sql b/sql/initialize.sql index bb04b8c..d68ddbf 100644 --- a/sql/initialize.sql +++ b/sql/initialize.sql @@ -16,6 +16,9 @@ DROP TABLE IF EXISTS threads CASCADE; DROP TABLE IF EXISTS posts CASCADE; DROP TABLE IF EXISTS attachments CASCADE; DROP FUNCTION IF EXISTS update_post_body_search_index; +DROP FUNCTION IF EXISTS fetch_top_threads; +DROP FUNCTION IF EXISTS fetch_catalog; + -- It won't let us drop roles otherwise and the IFs are to keep this script idempotent. DO @@ -73,6 +76,7 @@ CREATE INDEX posts_body_search_idx ON posts USING GIN (body_search_index); CREATE INDEX posts_thread_id_idx ON posts (thread_id); CREATE INDEX posts_board_post_id_idx ON posts (board_post_id); CREATE INDEX posts_thread_id_creation_time_idx ON posts (creation_time, thread_id); +--CREATE INDEX posts_thread_id_board_post_id_idx ON posts (thread_id, board_post_id); CREATE OR REPLACE FUNCTION update_post_body_search_index() RETURNS trigger AS $$ BEGIN @@ -167,6 +171,89 @@ $$ LANGUAGE sql; -- 1:21 for full db (nothing inserted) +CREATE OR REPLACE FUNCTION fetch_top_threads( + p_start_time TIMESTAMPTZ, + lookback INT DEFAULT 10000 +) +RETURNS TABLE(bump_time TIMESTAMPTZ, post_count BIGINT, thread_id BIGINT, where_to_leave_off TIMESTAMPTZ) +LANGUAGE sql +AS $$ + SELECT + max(creation_time) as bump_time, + count(*), + thread_id, + min(creation_time) as where_to_leave_off + FROM + ( + SELECT thread_id, creation_time + FROM posts + WHERE creation_time < p_start_time + ORDER BY creation_time DESC + LIMIT LEAST(lookback, 250000) -- capping the lookback to 250k + ) as t + GROUP BY thread_id + ORDER BY bump_time DESC; +$$; + + +CREATE OR REPLACE FUNCTION fetch_catalog(max_time timestamptz, max_row_read int DEFAULT 10000) +RETURNS TABLE ( + post_count bigint, + estimated_post_count bigint, + post_id bigint, + board_post_id bigint, + creation_time timestamptz, + body text, + thread_id bigint, + board_thread_id bigint, + pathpart text, + name text +) AS $$ + WITH + top AS + ( + SELECT * FROM fetch_top_threads(max_time, max_row_read) AS top + ), + tall_posts AS + ( + SELECT + top.post_count as estimated_post_count, + posts.post_id, + posts.board_post_id, + posts.creation_time, + posts.body, + posts.thread_id + FROM top + JOIN posts ON top.thread_id = posts.thread_id + WHERE creation_time < max_time + ), + op_posts AS + ( + SELECT DISTINCT ON (t.thread_id) + * + FROM tall_posts t + ORDER BY t.thread_id, t.board_post_id + ), + post_counts AS + ( + SELECT thread_id, count(*) as post_count FROM + tall_posts + GROUP BY thread_id + ) + SELECT + post_counts.post_count, + op_posts.*, + threads.board_thread_id, + boards.pathpart, + sites."name" + FROM op_posts + JOIN post_counts ON op_posts.thread_id = post_counts.thread_id + JOIN threads ON op_posts.thread_id = threads.thread_id + JOIN boards ON threads.board_id = boards.board_id + JOIN sites ON sites.site_id = boards.site_id; +$$ LANGUAGE sql; + + /* * Permissions */ @@ -192,6 +279,8 @@ GRANT ALL ON posts TO chan_archiver; GRANT ALL ON attachments TO chan_archiver; GRANT EXECUTE ON FUNCTION update_post_body_search_index TO chan_archiver; GRANT EXECUTE ON FUNCTION insert_posts_and_return_ids TO chan_archiver; +GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archiver; +GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archiver; GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;