- Change fetch_catalog and fetch_top_threads functions

- neither was correct
    - fetch_top_threads doesn't need to be a loop, it will be up to the
      application whether or not to load more results. This simplifies
      the function, which was sometimes returning more than one row for
      each thread_id anyway
    - fetch_catalog is now fairly performant, and fetches all relevant
      information
This commit is contained in:
towards-a-new-leftypol 2023-10-26 18:19:32 -04:00
parent 7b8b1b2255
commit c297bae697
3 changed files with 496 additions and 9 deletions

View File

@ -28,7 +28,7 @@ LIMIT
100;
SELECT
SELECT
thread_data.thread_id,
thread_data.bump_time,
thread_data.first_post_subject,
@ -313,6 +313,16 @@ group by thread_id
order by bump_time desc;
select max(creation_time) as bump_time, count(*), thread_id, min(creation_time) as where_to_leave_off
from
(
select thread_id, creation_time, body from posts
where creation_time < NOW() - interval '365 day'
order by creation_time desc limit 10000
) as t
group by thread_id
order by bump_time desc;
explain analyze SELECT
@ -330,8 +340,8 @@ LIMIT 100
-- Create temp table if not exists
drop table if exists temp_results;
CREATE TEMP TABLE IF NOT EXISTS temp_results (bump_time TIMESTAMPTZ, post_count INT, thread_id INT, last_fetched_time TIMESTAMP);
DROP TABLE IF EXISTS temp_results;
/*
* This function scans backwards from p_start_time until we get the desired number of threads.
@ -351,7 +361,8 @@ DECLARE
max_iterations INT := 10; -- Maximum iterations to avoid endless loop in case of errors
result_count INT := 0;
last_min_time TIMESTAMP;
BEGIN
BEGIN
CREATE TEMP TABLE IF NOT EXISTS temp_results (bump_time TIMESTAMPTZ, post_count INT, thread_id INT, last_fetched_time TIMESTAMP);
TRUNCATE temp_results; -- clear the table
FOR i IN 1..max_iterations LOOP
@ -368,9 +379,7 @@ BEGIN
GROUP BY t.thread_id;
-- Check if we have enough threads
--SELECT temp_results.thread_id INTO result_count FROM temp_results;
SELECT COUNT(DISTINCT temp_results.thread_id) INTO result_count FROM temp_results; -- specify the table name here to avoid ambiguity
SELECT COUNT(DISTINCT temp_results.thread_id) INTO result_count FROM temp_results;
IF result_count >= p_desired_threads THEN
EXIT;
@ -387,13 +396,14 @@ BEGIN
-- Return the results
RETURN QUERY SELECT temp_results.bump_time, temp_results.post_count, temp_results.thread_id FROM temp_results ORDER BY temp_results.bump_time DESC;
DROP TABLE IF EXISTS temp_results;
END;
$$;
select * FROM fetch_top_threads(NOW(), 300) offset 0;
select * FROM fetch_top_threads(NOW(), 100);
explain analyze WITH TopThreads AS (
WITH TopThreads AS (
SELECT * FROM fetch_top_threads(NOW() - interval '365d', 100)
)
SELECT
@ -409,3 +419,175 @@ JOIN
posts p ON tt.thread_id = p.thread_id
ORDER BY
p.creation_time asc;
SELECT *
FROM
fetch_top_threads(NOW() - interval '365d', 100) top
JOIN
posts ON posts.thread_id = top.thread_id
ORDER BY posts.creation_time ASC;
SELECT
post_id,
board_post_id,
posts.thread_id,
bump_time,
creation_time,
body
FROM
fetch_top_threads(NOW() - interval '365d', 100) top
JOIN
posts ON posts.thread_id = top.thread_id
ORDER BY posts.creation_time ASC;
VACUUM ANALYZE posts;
SELECT posts.post_id, posts.thread_id, posts.body
FROM
fetch_top_threads(NOW(), 100) top
JOIN
posts ON posts.thread_id = top.thread_id;
WITH
top AS (
SELECT * FROM fetch_top_threads(NOW(), 100) top
),
joined AS (
SELECT posts.post_id, posts.thread_id, posts.body
FROM top JOIN posts ON posts.thread_id = top.thread_id
ORDER BY posts.creation_time DESC
),
grouped AS (
SELECT min(joined.post_id) op_post_id, joined.thread_id FROM joined
GROUP BY joined.thread_id
) SELECT * FROM grouped JOIN joined ON op_post_id = joined.post_id;
WITH
top AS (
SELECT * FROM fetch_top_threads(NOW(), 100) OFFSET 0
),
joined AS (
SELECT posts.post_id, posts.thread_id, posts.body, posts.creation_time
FROM top JOIN posts ON posts.thread_id = top.thread_id
ORDER BY posts.creation_time DESC
),
grouped AS (
SELECT min(joined.creation_time) op_creation_time, joined.thread_id, count(*) AS post_count FROM joined
GROUP BY joined.thread_id
) SELECT joined.*, grouped.post_count
FROM grouped JOIN joined ON op_creation_time = joined.creation_time AND grouped.thread_id = joined.thread_id;
WITH
top AS (
SELECT * FROM fetch_top_threads(NOW(), 200) OFFSET 0
),
joined AS (
SELECT
posts.post_id,
posts.thread_id,
posts.body,
posts.creation_time,
ROW_NUMBER() OVER(PARTITION BY posts.thread_id ORDER BY posts.creation_time, posts.post_id) as rn
FROM top
JOIN posts ON posts.thread_id = top.thread_id
),
grouped AS (
SELECT joined.post_id, joined.thread_id
FROM joined
WHERE joined.rn = 1
)
SELECT joined.*
FROM grouped
JOIN joined ON grouped.post_id = joined.post_id;
SELECT
sub.post_id,
sub.thread_id,
sub.body,
sub.creation_time
FROM
(
SELECT
posts.post_id,
posts.thread_id,
posts.body,
posts.creation_time,
ROW_NUMBER() OVER(PARTITION BY posts.thread_id ORDER BY posts.creation_time, posts.post_id) as rn
FROM fetch_top_threads(NOW(), 200) AS top
JOIN posts ON posts.thread_id = top.thread_id
) AS sub
WHERE
sub.rn = 1;
SELECT
sub.post_id,
sub.thread_id,
sub.body,
sub.creation_time
FROM
(
SELECT
posts.post_id,
posts.thread_id,
posts.body,
posts.creation_time,
ROW_NUMBER() OVER(PARTITION BY posts.thread_id ORDER BY posts.creation_time, posts.post_id) as rn
FROM
(SELECT * FROM fetch_top_threads(NOW(), 200) OFFSET 0) AS top
JOIN posts ON posts.thread_id = top.thread_id
) AS sub
WHERE
sub.rn = 1;
SELECT count(*) FROM fetch_catalog(NOW(), 400, 0);
SELECT * FROM fetch_top_threads(NOW(), 200);
SELECT count(*) FROM
(
SELECT top.thread_id, count(*) as post_count FROM fetch_top_threads(NOW(), 200) top JOIN posts ON posts.thread_id = top.thread_id
GROUP BY top.thread_id
) t;
-- how do I
CREATE OR REPLACE FUNCTION fetch_catalog(max_time timestamptz, wanted_result_count integer, offset_count integer)
RETURNS TABLE (
op_post_id bigint,
thread_id bigint,
post_count bigint,
body text,
creation_time timestamptz
)
LANGUAGE SQL
AS $$
WITH
top AS (
SELECT * FROM fetch_top_threads(max_time, wanted_result_count) OFFSET offset_count
),
joined AS (
SELECT posts.post_id, posts.thread_id, posts.body, posts.creation_time
FROM top JOIN posts ON posts.thread_id = top.thread_id
ORDER BY posts.creation_time DESC
),
grouped AS (
SELECT min(joined.post_id) op_post_id, joined.thread_id, count(*) AS post_count FROM joined
GROUP BY joined.thread_id
) SELECT grouped.*, joined.body, joined.creation_time FROM grouped JOIN joined ON op_post_id = joined.post_id;
$$;

216
sql/archive_tests2.sql Normal file
View File

@ -0,0 +1,216 @@
CREATE OR REPLACE FUNCTION fetch_top_threads(
p_start_time TIMESTAMPTZ,
lookback INT DEFAULT 10000
)
RETURNS TABLE(bump_time TIMESTAMPTZ, post_count BIGINT, thread_id BIGINT, where_to_leave_off TIMESTAMPTZ)
LANGUAGE sql
AS $$
SELECT
max(creation_time) as bump_time,
count(*),
thread_id,
min(creation_time) as where_to_leave_off
FROM
(
SELECT thread_id, creation_time
FROM posts
WHERE creation_time < p_start_time
ORDER BY creation_time DESC
LIMIT LEAST(lookback, 250000) -- capping the lookback to 250k
) as t
GROUP BY thread_id
ORDER BY bump_time DESC;
$$;
SELECT * FROM fetch_top_threads(NOW() - INTERVAL '10d');
SELECT * FROM fetch_top_threads(NOW() - INTERVAL '10d') top JOIN posts ON top.thread_id = posts.thread_id;
SELECT
ordered_posts.thread_id,
count(*),
array_agg(ordered_posts.body)
FROM (
SELECT top.thread_id, posts.body
FROM fetch_top_threads(NOW() - INTERVAL '10d', 2000) top
JOIN posts ON top.thread_id = posts.thread_id
ORDER BY posts.board_post_id ASC
) AS ordered_posts
GROUP BY ordered_posts.thread_id;
WITH
top AS
(
SELECT * FROM fetch_top_threads(NOW()) top
),
tall_posts AS
(
SELECT top.post_count as estimated_post_count, posts.*
FROM top
JOIN posts ON top.thread_id = posts.thread_id
),
op_posts AS
(
SELECT DISTINCT ON (t.thread_id)
*
FROM tall_posts t
ORDER BY t.thread_id, t.board_post_id
)
SELECT * FROM op_posts;
WITH
top AS
(
SELECT * FROM fetch_top_threads(NOW()) top
),
tall_posts AS
(
SELECT top.post_count as estimated_post_count, posts.*
FROM top
JOIN posts ON top.thread_id = posts.thread_id
),
op_posts AS
(
SELECT DISTINCT ON (t.thread_id)
*
FROM tall_posts t
ORDER BY t.thread_id, t.board_post_id
),
post_counts AS
(
SELECT thread_id, count(*) as post_count FROM
tall_posts
GROUP BY thread_id
)
SELECT post_counts.post_count, op_posts.* FROM op_posts JOIN post_counts ON op_posts.thread_id = post_counts.thread_id;
-- 71ms!
WITH
top AS
(
SELECT * FROM fetch_top_threads(NOW() - INTERVAL '1y', 1000) top
),
tall_posts AS
(
SELECT
top.post_count as estimated_post_count,
top.bump_time,
posts.post_id,
posts.board_post_id,
posts.creation_time,
posts.body,
posts.thread_id
FROM top
JOIN posts ON top.thread_id = posts.thread_id
),
op_posts AS
(
SELECT DISTINCT ON (t.thread_id)
*
FROM tall_posts t
ORDER BY t.thread_id, t.board_post_id
),
post_counts AS
(
SELECT thread_id, count(*) as post_count FROM
tall_posts
GROUP BY thread_id
)
SELECT
post_counts.post_count,
op_posts.*,
threads.board_thread_id,
boards.pathpart,
sites."name"
FROM op_posts
JOIN post_counts ON op_posts.thread_id = post_counts.thread_id
JOIN threads ON op_posts.thread_id = threads.thread_id
JOIN boards ON threads.board_id = boards.board_id
JOIN sites ON sites.site_id = boards.site_id;
CREATE OR REPLACE FUNCTION fetch_catalog(max_time timestamptz, max_row_read int DEFAULT 10000)
RETURNS TABLE (
post_count bigint,
estimated_post_count bigint,
post_id bigint,
board_post_id bigint,
creation_time timestamptz,
body text,
thread_id bigint,
board_thread_id bigint,
pathpart text,
name text
) AS $$
WITH
top AS
(
SELECT * FROM fetch_top_threads(max_time, max_row_read) AS top
),
tall_posts AS
(
SELECT
top.post_count as estimated_post_count,
posts.post_id,
posts.board_post_id,
posts.creation_time,
posts.body,
posts.thread_id
FROM top
JOIN posts ON top.thread_id = posts.thread_id
WHERE creation_time < max_time
),
op_posts AS
(
SELECT DISTINCT ON (t.thread_id)
*
FROM tall_posts t
ORDER BY t.thread_id, t.board_post_id
),
post_counts AS
(
SELECT thread_id, count(*) as post_count FROM
tall_posts
GROUP BY thread_id
)
SELECT
post_counts.post_count,
op_posts.*,
threads.board_thread_id,
boards.pathpart,
sites."name"
FROM op_posts
JOIN post_counts ON op_posts.thread_id = post_counts.thread_id
JOIN threads ON op_posts.thread_id = threads.thread_id
JOIN boards ON threads.board_id = boards.board_id
JOIN sites ON sites.site_id = boards.site_id;
$$ LANGUAGE sql;
SELECT * FROM fetch_catalog(NOW() - INTERVAL '1y', 1001);
-- CREATE INDEX idx_posts_thread_board ON posts (thread_id, board_post_id);
ANALYZE posts;
SELECT DISTINCT ON (top.thread_id)
top.thread_id,
posts.post_id,
posts.body,
top.post_count,
top.where_to_leave_off
FROM fetch_top_threads(NOW() - INTERVAL '10d') top
JOIN posts ON top.thread_id = posts.thread_id
ORDER BY top.thread_id, posts.board_post_id;
SELECT * FROM threads WHERE thread_id = 3110;
SELECT * FROM posts WHERE thread_id = 3110 ORDER BY board_post_id ASC;
SELECT * FROM boards;
SELECT * FROM threads WHERE thread_id = 11314;
ANALYZE posts;

View File

@ -16,6 +16,9 @@ DROP TABLE IF EXISTS threads CASCADE;
DROP TABLE IF EXISTS posts CASCADE;
DROP TABLE IF EXISTS attachments CASCADE;
DROP FUNCTION IF EXISTS update_post_body_search_index;
DROP FUNCTION IF EXISTS fetch_top_threads;
DROP FUNCTION IF EXISTS fetch_catalog;
-- It won't let us drop roles otherwise and the IFs are to keep this script idempotent.
DO
@ -73,6 +76,7 @@ CREATE INDEX posts_body_search_idx ON posts USING GIN (body_search_index);
CREATE INDEX posts_thread_id_idx ON posts (thread_id);
CREATE INDEX posts_board_post_id_idx ON posts (board_post_id);
CREATE INDEX posts_thread_id_creation_time_idx ON posts (creation_time, thread_id);
--CREATE INDEX posts_thread_id_board_post_id_idx ON posts (thread_id, board_post_id);
CREATE OR REPLACE FUNCTION update_post_body_search_index() RETURNS trigger AS $$
BEGIN
@ -167,6 +171,89 @@ $$ LANGUAGE sql;
-- 1:21 for full db (nothing inserted)
CREATE OR REPLACE FUNCTION fetch_top_threads(
p_start_time TIMESTAMPTZ,
lookback INT DEFAULT 10000
)
RETURNS TABLE(bump_time TIMESTAMPTZ, post_count BIGINT, thread_id BIGINT, where_to_leave_off TIMESTAMPTZ)
LANGUAGE sql
AS $$
SELECT
max(creation_time) as bump_time,
count(*),
thread_id,
min(creation_time) as where_to_leave_off
FROM
(
SELECT thread_id, creation_time
FROM posts
WHERE creation_time < p_start_time
ORDER BY creation_time DESC
LIMIT LEAST(lookback, 250000) -- capping the lookback to 250k
) as t
GROUP BY thread_id
ORDER BY bump_time DESC;
$$;
CREATE OR REPLACE FUNCTION fetch_catalog(max_time timestamptz, max_row_read int DEFAULT 10000)
RETURNS TABLE (
post_count bigint,
estimated_post_count bigint,
post_id bigint,
board_post_id bigint,
creation_time timestamptz,
body text,
thread_id bigint,
board_thread_id bigint,
pathpart text,
name text
) AS $$
WITH
top AS
(
SELECT * FROM fetch_top_threads(max_time, max_row_read) AS top
),
tall_posts AS
(
SELECT
top.post_count as estimated_post_count,
posts.post_id,
posts.board_post_id,
posts.creation_time,
posts.body,
posts.thread_id
FROM top
JOIN posts ON top.thread_id = posts.thread_id
WHERE creation_time < max_time
),
op_posts AS
(
SELECT DISTINCT ON (t.thread_id)
*
FROM tall_posts t
ORDER BY t.thread_id, t.board_post_id
),
post_counts AS
(
SELECT thread_id, count(*) as post_count FROM
tall_posts
GROUP BY thread_id
)
SELECT
post_counts.post_count,
op_posts.*,
threads.board_thread_id,
boards.pathpart,
sites."name"
FROM op_posts
JOIN post_counts ON op_posts.thread_id = post_counts.thread_id
JOIN threads ON op_posts.thread_id = threads.thread_id
JOIN boards ON threads.board_id = boards.board_id
JOIN sites ON sites.site_id = boards.site_id;
$$ LANGUAGE sql;
/*
* Permissions
*/
@ -192,6 +279,8 @@ GRANT ALL ON posts TO chan_archiver;
GRANT ALL ON attachments TO chan_archiver;
GRANT EXECUTE ON FUNCTION update_post_body_search_index TO chan_archiver;
GRANT EXECUTE ON FUNCTION insert_posts_and_return_ids TO chan_archiver;
GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archiver;
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archiver;
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;