From 9e3c054c53bb0ca66da67cc2537eb68a52bbcaf3 Mon Sep 17 00:00:00 2001 From: towards-a-new-leftypol Date: Mon, 19 Feb 2024 14:51:56 -0500 Subject: [PATCH] sql: improve search_posts function --- sql/archive_tests.sql | 24 ++++++--- sql/archive_tests2.sql | 4 +- sql/fix_search_posts.sql | 28 ++++++++++ sql/initialize.sql | 24 ++++++--- sql/recreate_attachments_table.sql | 40 ++++++++++++++ sql/remake_fetch_catalog.sql | 87 ++++++++++++++++++++++++++++++ 6 files changed, 192 insertions(+), 15 deletions(-) create mode 100644 sql/fix_search_posts.sql create mode 100644 sql/recreate_attachments_table.sql create mode 100644 sql/remake_fetch_catalog.sql diff --git a/sql/archive_tests.sql b/sql/archive_tests.sql index 21a9d99..e063078 100644 --- a/sql/archive_tests.sql +++ b/sql/archive_tests.sql @@ -592,12 +592,15 @@ SELECT * FROM posts WHERE body_search_index @@ websearch_to_tsquery('english', ' WITH query AS ( - SELECT websearch_to_tsquery('english', 'Anonymous') AS query + SELECT websearch_to_tsquery('english', 'sound cringe irl') AS query ) -SELECT p.*, ts_rank(p.body_search_index, query.query) AS relevance -FROM posts p, query +SELECT p.*, pathpart, board_thread_id, + ts_rank(p.body_search_index, query.query) + / (1 + EXTRACT(EPOCH FROM AGE(p.creation_time)) / (3600 * 24)) AS relevance +FROM posts p JOIN threads ON threads.thread_id = p.thread_id JOIN boards ON boards.board_id = threads.board_id, query WHERE p.body_search_index @@ query.query -ORDER BY relevance DESC; +ORDER BY relevance +DESC; CREATE OR REPLACE FUNCTION search_posts(search_text TEXT) @@ -606,11 +609,15 @@ AS $$ SELECT p.* FROM posts p WHERE p.body_search_index @@ websearch_to_tsquery('english', search_text) -ORDER BY ts_rank(p.body_search_index, websearch_to_tsquery('english', search_text)) DESC; +ORDER BY + ts_rank(p.body_search_index, websearch_to_tsquery('english', search_text)) + / + (1 + EXTRACT(EPOCH FROM AGE(p.creation_time)) / (3600 * 24)) + DESC; $$ LANGUAGE sql STABLE; -SELECT * FROM search_posts('alt chans') s +SELECT * FROM search_posts('archive') s JOIN threads ON threads.thread_id = s.thread_id JOIN boards ON boards.board_id = threads.board_id; @@ -641,3 +648,8 @@ SET body_search_index = ( ) WHERE board_post_id = 476524; +SELECT count(*) +FROM posts +WHERE creation_time >= CURRENT_DATE AT TIME ZONE 'UTC' + AND creation_time < (CURRENT_DATE + INTERVAL '1 day') AT TIME ZONE 'UTC'; + diff --git a/sql/archive_tests2.sql b/sql/archive_tests2.sql index 7a48bbf..690c81e 100644 --- a/sql/archive_tests2.sql +++ b/sql/archive_tests2.sql @@ -23,7 +23,7 @@ AS $$ $$; -SELECT * FROM fetch_top_threads(NOW() - INTERVAL '10d'); +SELECT * FROM fetch_top_threads(NOW(), 1000); SELECT * FROM fetch_top_threads(NOW() - INTERVAL '10d') top JOIN posts ON top.thread_id = posts.thread_id; @@ -197,7 +197,7 @@ $$ LANGUAGE sql; SELECT * FROM fetch_catalog(NOW() - INTERVAL '1y', 1001); -SELECT * FROM fetch_catalog(NOW(), 5000); +SELECT * FROM fetch_catalog(NOW(), 1000); SELECT count(*) FROM posts; diff --git a/sql/fix_search_posts.sql b/sql/fix_search_posts.sql new file mode 100644 index 0000000..8a6fa9f --- /dev/null +++ b/sql/fix_search_posts.sql @@ -0,0 +1,28 @@ +BEGIN TRANSACTION; + +DROP FUNCTION IF EXISTS search_posts; + +CREATE OR REPLACE FUNCTION search_posts(search_text text) +RETURNS TABLE ( + post posts, + pathpart text, + board_thread_id bigint, + relevance double precision +) AS $$ +WITH query AS ( + SELECT websearch_to_tsquery('english', search_text) AS query +) +SELECT p::posts AS post, pathpart, board_thread_id, + ts_rank(p.body_search_index, query.query) + / (1 + EXTRACT(EPOCH FROM AGE(p.creation_time)) / (3600 * 24)) AS relevance +FROM posts p JOIN threads ON threads.thread_id = p.thread_id JOIN boards ON boards.board_id = threads.board_id, query +WHERE p.body_search_index @@ query.query +ORDER BY relevance +DESC +$$ LANGUAGE sql STABLE; + +GRANT EXECUTE ON FUNCTION search_posts TO chan_archive_anon; +GRANT EXECUTE ON FUNCTION search_posts TO chan_archiver; +REVOKE EXECUTE ON FUNCTION search_posts FROM PUBLIC; + +ROLLBACK; diff --git a/sql/initialize.sql b/sql/initialize.sql index c0def53..bf58149 100644 --- a/sql/initialize.sql +++ b/sql/initialize.sql @@ -320,13 +320,23 @@ $$ LANGUAGE sql; -- Example: -- -- To search for posts related to 'quantum computing': -- SELECT * FROM search_posts('quantum computing'); -CREATE OR REPLACE FUNCTION search_posts(search_text TEXT) -RETURNS SETOF posts -AS $$ -SELECT p.* -FROM posts p -WHERE p.body_search_index @@ websearch_to_tsquery('english', search_text) -ORDER BY ts_rank(p.body_search_index, websearch_to_tsquery('english', search_text)) DESC; +CREATE OR REPLACE FUNCTION search_posts(search_text text) +RETURNS TABLE ( + post posts, + pathpart text, + board_thread_id bigint, + relevance double precision +) AS $$ +WITH query AS ( + SELECT websearch_to_tsquery('english', search_text) AS query +) +SELECT p::posts AS post, pathpart, board_thread_id, + ts_rank(p.body_search_index, query.query) + / (1 + EXTRACT(EPOCH FROM AGE(p.creation_time)) / (3600 * 24)) AS relevance +FROM posts p JOIN threads ON threads.thread_id = p.thread_id JOIN boards ON boards.board_id = threads.board_id, query +WHERE p.body_search_index @@ query.query +ORDER BY relevance +DESC $$ LANGUAGE sql STABLE; diff --git a/sql/recreate_attachments_table.sql b/sql/recreate_attachments_table.sql new file mode 100644 index 0000000..12bfe06 --- /dev/null +++ b/sql/recreate_attachments_table.sql @@ -0,0 +1,40 @@ +BEGIN TRANSACTION; + +DROP TYPE IF EXISTS dimension CASCADE; +DROP TABLE IF EXISTS attachments CASCADE; + +CREATE TYPE dimension AS + ( width int + , height int + ); + +CREATE TABLE IF NOT EXISTS attachments + ( attachment_id bigserial primary key + , mimetype text NOT NULL + , creation_time timestamp with time zone NOT NULL + , sha256_hash text NOT NULL + , phash bigint + , illegal boolean NOT NULL DEFAULT false + , post_id bigint NOT NULL + , resolution dimension + , file_extension text + , thumb_extension text + , original_filename text + , board_filename text NOT NULL + , spoiler boolean NOT NULL DEFAULT true + , file_size_bytes int + , CONSTRAINT post_fk FOREIGN KEY (post_id) REFERENCES posts (post_id) ON DELETE CASCADE + ); +CREATE INDEX attachments_creation_time_idx ON attachments (creation_time); +CREATE INDEX attachments_post_id_idx ON attachments (post_id); +CREATE INDEX attachments_sha256_hash_idx ON attachments (sha256_hash); +-- +-- Index using the bktree extension for quickly getting the closest phashes +CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops); + + +GRANT SELECT ON attachments TO chan_archive_anon; +GRANT ALL ON attachments TO chan_archiver; +GRANT usage, select ON SEQUENCE attachments_attachment_id_seq TO chan_archiver; + +COMMIT; diff --git a/sql/remake_fetch_catalog.sql b/sql/remake_fetch_catalog.sql new file mode 100644 index 0000000..bd3fecd --- /dev/null +++ b/sql/remake_fetch_catalog.sql @@ -0,0 +1,87 @@ +BEGIN TRANSACTION; + +DROP FUNCTION IF EXISTS fetch_catalog; + +CREATE OR REPLACE FUNCTION fetch_catalog(max_time timestamptz, max_row_read int DEFAULT 10000) +RETURNS TABLE ( + -- post_count bigint, + estimated_post_count bigint, + post_id bigint, + board_post_id bigint, + creation_time timestamptz, + bump_time timestamptz, + body text, + subject text, + thread_id bigint, + board_thread_id bigint, + pathpart text, + site_name text, + file_mimetype text, + file_illegal boolean, + -- file_resolution dimension, + file_name text, + file_extension text, + file_thumb_extension text +) AS $$ + WITH + top AS + ( + SELECT * FROM fetch_top_threads(max_time, max_row_read) AS top + ), + tall_posts AS + ( + SELECT + top.post_count as estimated_post_count, + posts.post_id, + posts.board_post_id, + posts.creation_time, + top.bump_time, + posts.body, + posts.subject, + posts.thread_id + FROM top + JOIN posts ON top.thread_id = posts.thread_id + WHERE creation_time < max_time + ), + op_posts AS + ( + SELECT DISTINCT ON (t.thread_id) + * + FROM tall_posts t + ORDER BY t.thread_id, t.board_post_id + )-- , + -- post_counts AS + -- ( + -- SELECT thread_id, count(*) as post_count FROM + -- tall_posts + -- GROUP BY thread_id + -- ) + SELECT + -- post_counts.post_count, + op_posts.*, + threads.board_thread_id, -- this should be part of the url path when creating links, not thread_id (that's internal) + boards.pathpart, + sites."name", + -- sites.site_id, + attachments.mimetype as file_mimetype, + attachments.illegal as file_illegal, + -- attachments.resolution as file_resolution, + attachments.board_filename as file_name, + attachments.file_extension, + attachments.thumb_extension as file_thumb_extension + FROM op_posts + -- JOIN post_counts ON op_posts.thread_id = post_counts.thread_id + JOIN threads ON op_posts.thread_id = threads.thread_id + JOIN boards ON threads.board_id = boards.board_id + JOIN sites ON sites.site_id = boards.site_id + LEFT OUTER JOIN attachments ON attachments.post_id = op_posts.post_id + ORDER BY bump_time DESC; +$$ LANGUAGE sql; + + +REVOKE EXECUTE ON FUNCTION fetch_catalog FROM PUBLIC; +GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archive_anon; +GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archiver; + +-- ROLLBACK; +COMMIT;