diff --git a/sql/archive_tests.sql b/sql/archive_tests.sql new file mode 100644 index 0000000..7b9dce8 --- /dev/null +++ b/sql/archive_tests.sql @@ -0,0 +1,411 @@ +select count(*) from posts; + +select board_id, count(*) as post_count from posts natural join threads group by board_id; + +select sum(post_count) from (select board_id, count(*) as post_count from posts join threads on posts.thread_id = threads.thread_id group by threads.board_id) as t; + +select board_id, count(*) from threads natural join boards group by board_id; + +select count(*) from posts; + +SELECT + threads.thread_id, + MAX(posts.creation_time) AS bump_time, + -- FIRST_VALUE(posts.subject) OVER (PARTITION BY threads.thread_id ORDER BY posts.creation_time) AS first_post_subject, + FIRST_VALUE(posts.body) OVER (PARTITION BY threads.thread_id ORDER BY posts.creation_time) AS first_post_body, + COUNT(*) AS post_count +FROM + threads +JOIN + posts ON threads.thread_id = posts.thread_id +WHERE + posts.creation_time < NOW() -- Specify your desired time here +GROUP BY + threads.thread_id +ORDER BY + bump_time DESC +LIMIT + 100; + + + SELECT + thread_data.thread_id, + thread_data.bump_time, + thread_data.first_post_subject, + thread_data.first_post_body, + COUNT(*) AS post_count +FROM ( + SELECT + threads.thread_id, + MAX(posts.creation_time) AS bump_time, + -- FIRST_VALUE(posts.subject) OVER (PARTITION BY threads.thread_id ORDER BY posts.creation_time) AS first_post_subject, + FIRST_VALUE(posts.body) OVER (PARTITION BY threads.thread_id ORDER BY posts.creation_time) AS first_post_body, + posts.creation_time -- This can be removed if it's not needed in outer query + FROM + threads + JOIN + posts ON threads.thread_id = posts.thread_id + WHERE + posts.creation_time < NOW() -- Specify your desired time here +) AS thread_data +GROUP BY + thread_data.thread_id, thread_data.bump_time, thread_data.first_post_subject, thread_data.first_post_body +ORDER BY + thread_data.bump_time DESC +LIMIT + 100; + + +explain ANALYZE select + +-- runs in 1.5s +SELECT + thread_data.thread_id, + MAX(thread_data.creation_time) AS bump_time, + MIN(thread_data.first_post_body) AS first_post_body, -- Getting the first post's body per thread using MIN as a trick since FIRST_VALUE is the same for all rows per thread in the subquery + COUNT(*) AS post_count +FROM ( + SELECT + threads.thread_id, + posts.creation_time, + FIRST_VALUE(posts.body) OVER (PARTITION BY threads.thread_id ORDER BY posts.creation_time) AS first_post_body + FROM + threads + JOIN + posts ON threads.thread_id = posts.thread_id + WHERE + posts.creation_time < NOW() -- Specify your desired time here +) AS thread_data +GROUP BY + thread_data.thread_id +ORDER BY + bump_time DESC +LIMIT + 100; + + +-- runs in 640ms +WITH LatestPosts AS ( + -- Get the latest post time per thread + SELECT + p.thread_id, + MAX(p.creation_time) AS bump_time + FROM + posts p + WHERE + p.creation_time < NOW() - INTERVAL '365 day' + GROUP BY + p.thread_id + ORDER BY + bump_time DESC + LIMIT 100 +), +FirstPost AS ( + -- Get the first post's body per thread + SELECT + p.thread_id, + p.body AS first_post_body + FROM + posts p + JOIN + LatestPosts lp ON p.thread_id = lp.thread_id + WHERE + p.creation_time = ( + SELECT MIN(p2.creation_time) + FROM posts p2 + WHERE p2.thread_id = p.thread_id + ) +) +SELECT + lp.thread_id, + lp.bump_time, + fp.first_post_body, + COUNT(p.post_id) AS post_count +FROM + LatestPosts lp +JOIN + FirstPost fp ON lp.thread_id = fp.thread_id +JOIN + posts p ON lp.thread_id = p.thread_id +GROUP BY + lp.thread_id, lp.bump_time, fp.first_post_body +ORDER BY + lp.bump_time DESC; + + + + + + + + + + + + -- runs in 1.2s + WITH LatestPosts AS ( + -- Get the latest post time per thread + SELECT + p.thread_id, + MAX(p.creation_time) AS bump_time + FROM + posts p + WHERE + p.creation_time < NOW() - INTERVAL '365 day' + GROUP BY + p.thread_id + ORDER BY + bump_time DESC + LIMIT 100 +), +FirstPost AS ( + -- Get the first post's body per thread + SELECT + p.thread_id, + p.body AS first_post_body + FROM ( + SELECT + thread_id, + body, + RANK() OVER(PARTITION BY thread_id ORDER BY creation_time) as rnk + FROM + posts + ) p + WHERE + rnk = 1 + AND + p.thread_id IN (SELECT thread_id FROM LatestPosts) -- Optional: limit rows to those from LatestPosts CTE +) +SELECT + lp.thread_id, + lp.bump_time, + fp.first_post_body, + COUNT(p.post_id) AS post_count +FROM + LatestPosts lp +JOIN + FirstPost fp ON lp.thread_id = fp.thread_id +JOIN + posts p ON lp.thread_id = p.thread_id +GROUP BY + lp.thread_id, lp.bump_time, fp.first_post_body +ORDER BY + lp.bump_time DESC; + + + +-- runs in ~420ms +WITH LatestPosts AS ( + -- Get the latest post time per thread + SELECT + p.thread_id, + MAX(p.creation_time) AS bump_time + FROM + posts p + WHERE + p.creation_time < NOW() - interval '365 day' + GROUP BY + p.thread_id + ORDER BY + bump_time DESC + LIMIT 100 +) +-- Main Query +SELECT + lp.thread_id, + lp.bump_time, + fp.first_post_body, + COUNT(p.post_id) AS post_count +FROM + LatestPosts lp +JOIN LATERAL ( + SELECT p.body AS first_post_body + FROM posts p + WHERE p.thread_id = lp.thread_id + ORDER BY p.creation_time + LIMIT 1 +) fp ON TRUE +JOIN + posts p ON lp.thread_id = p.thread_id +GROUP BY + lp.thread_id, lp.bump_time, fp.first_post_body +ORDER BY + lp.bump_time DESC; + + +select sum(counts) from +( +select max(creation_time) as bump_time, count(*) as counts, thread_id +from + ( + select thread_id, creation_time from posts + where creation_time < NOW() + order by creation_time desc + ) as t +group by thread_id +order by bump_time desc +limit 2 +) as t; + + + +-- actually we don't need these, because "first" would not be the OP it would be the most recent post. We should just join the results with post +-- to get the OPs + +-- Create a function that always returns the first non-NULL value: +CREATE OR REPLACE FUNCTION public.first_agg (anyelement, anyelement) + RETURNS anyelement + LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE AS +'SELECT $1'; + +-- Then wrap an aggregate around it: +CREATE AGGREGATE public.first (anyelement) ( + SFUNC = public.first_agg +, STYPE = anyelement +, PARALLEL = safe +); + +------------ + + +( +select max(creation_time) as bump_time, count(*), thread_id, min(creation_time) as where_to_leave_off +from + ( + select thread_id, creation_time, body from posts + where creation_time < NOW() - interval '365 day' + order by creation_time desc limit 1500 + + ) as t +group by thread_id +order by bump_time desc +) + +union all + +( +select max(creation_time) as bump_time, count(*), thread_id, min(creation_time) as where_to_leave_off +from + ( + select thread_id, creation_time, body from posts + where creation_time < '2022-10-04 12:31:25.000 -0400' + order by creation_time desc limit 500 + ) as t +group by thread_id +order by bump_time desc +); + + +-- runs in 8ms +select max(creation_time) as bump_time, count(*), thread_id, min(creation_time) as where_to_leave_off +from + ( + (select thread_id, creation_time, body from posts + where creation_time < NOW() - interval '365 day' + order by creation_time desc limit 1500) + union all + (select thread_id, creation_time, body from posts + where creation_time < '2022-10-04 12:31:25.000 -0400' + order by creation_time desc limit 500) + + ) as t +group by thread_id +order by bump_time desc; + + + + +explain analyze SELECT + p.thread_id, + MAX(p.creation_time) AS bump_time + FROM + posts p + WHERE + p.creation_time < NOW() - interval '365 day' +GROUP BY + p.thread_id +ORDER BY + bump_time DESC +LIMIT 100 + + +-- Create temp table if not exists +drop table if exists temp_results; +CREATE TEMP TABLE IF NOT EXISTS temp_results (bump_time TIMESTAMPTZ, post_count INT, thread_id INT, last_fetched_time TIMESTAMP); + +/* + * This function scans backwards from p_start_time until we get the desired number of threads. + * Since the posts are ordered by creation time, the first post encountered for a particular + * thread will have that thread's bump time. Any posts after that will have a timestamp + * further back in time, so thread ids that we encounter will not change order the + * more we scan. + * + * This is to avoid a sequence scan on the entire posts table... + */ +CREATE OR REPLACE FUNCTION fetch_top_threads(p_start_time TIMESTAMPTZ, p_desired_threads INT) +RETURNS TABLE(bump_time TIMESTAMPTZ, post_count INT, thread_id INT) +LANGUAGE plpgsql +AS $$ +DECLARE + limit_size INT := 2000; -- Start with 2000 posts + max_iterations INT := 10; -- Maximum iterations to avoid endless loop in case of errors + result_count INT := 0; + last_min_time TIMESTAMP; +BEGIN + TRUNCATE temp_results; -- clear the table + + FOR i IN 1..max_iterations LOOP + -- Fetch posts starting from the previous min time + INSERT INTO temp_results(bump_time, post_count, thread_id, last_fetched_time) + SELECT max(t.creation_time) as bump_time, count(*) as post_count, t.thread_id, min(t.creation_time) as last_fetched_time + FROM ( + SELECT p.thread_id, p.creation_time + FROM posts p + WHERE p.creation_time < COALESCE(last_min_time, p_start_time) + ORDER BY p.creation_time DESC + LIMIT limit_size + ) as t + GROUP BY t.thread_id; + + -- Check if we have enough threads + --SELECT temp_results.thread_id INTO result_count FROM temp_results; + SELECT COUNT(DISTINCT temp_results.thread_id) INTO result_count FROM temp_results; -- specify the table name here to avoid ambiguity + + + IF result_count >= p_desired_threads THEN + EXIT; + END IF; + + -- Get the last min time from the temp table for the next iteration + SELECT MIN(last_fetched_time) INTO last_min_time FROM temp_results; + + -- Double the limit for the next iteration + limit_size := limit_size * 2; + + END LOOP; + + -- Return the results + RETURN QUERY SELECT temp_results.bump_time, temp_results.post_count, temp_results.thread_id FROM temp_results ORDER BY temp_results.bump_time DESC; + +END; +$$; + + +select * FROM fetch_top_threads(NOW(), 300) offset 0; + +explain analyze WITH TopThreads AS ( + SELECT * FROM fetch_top_threads(NOW() - interval '365d', 100) +) +SELECT + tt.thread_id, + tt.bump_time, + tt.post_count, + p.creation_time, + p.body, + p.post_id +FROM + TopThreads tt +JOIN + posts p ON tt.thread_id = p.thread_id +ORDER BY + p.creation_time asc; diff --git a/sql/initialize.sql b/sql/initialize.sql index b07a684..bb04b8c 100644 --- a/sql/initialize.sql +++ b/sql/initialize.sql @@ -72,9 +72,7 @@ CREATE INDEX posts_creation_time_idx ON posts (creation_time); CREATE INDEX posts_body_search_idx ON posts USING GIN (body_search_index); CREATE INDEX posts_thread_id_idx ON posts (thread_id); CREATE INDEX posts_board_post_id_idx ON posts (board_post_id); - --- This is to optimize joins on thread_id and filtering/sorting by creation_time in 'posts' table. -CREATE INDEX posts_thread_id_creation_time_idx ON posts (thread_id, creation_time); +CREATE INDEX posts_thread_id_creation_time_idx ON posts (creation_time, thread_id); CREATE OR REPLACE FUNCTION update_post_body_search_index() RETURNS trigger AS $$ BEGIN