sql: improve search_posts function

This commit is contained in:
towards-a-new-leftypol 2024-02-19 14:51:56 -05:00
parent 96546a8f7e
commit 9e3c054c53
6 changed files with 192 additions and 15 deletions

View File

@ -592,12 +592,15 @@ SELECT * FROM posts WHERE body_search_index @@ websearch_to_tsquery('english', '
WITH query AS (
SELECT websearch_to_tsquery('english', 'Anonymous') AS query
SELECT websearch_to_tsquery('english', 'sound cringe irl') AS query
)
SELECT p.*, ts_rank(p.body_search_index, query.query) AS relevance
FROM posts p, query
SELECT p.*, pathpart, board_thread_id,
ts_rank(p.body_search_index, query.query)
/ (1 + EXTRACT(EPOCH FROM AGE(p.creation_time)) / (3600 * 24)) AS relevance
FROM posts p JOIN threads ON threads.thread_id = p.thread_id JOIN boards ON boards.board_id = threads.board_id, query
WHERE p.body_search_index @@ query.query
ORDER BY relevance DESC;
ORDER BY relevance
DESC;
CREATE OR REPLACE FUNCTION search_posts(search_text TEXT)
@ -606,11 +609,15 @@ AS $$
SELECT p.*
FROM posts p
WHERE p.body_search_index @@ websearch_to_tsquery('english', search_text)
ORDER BY ts_rank(p.body_search_index, websearch_to_tsquery('english', search_text)) DESC;
ORDER BY
ts_rank(p.body_search_index, websearch_to_tsquery('english', search_text))
/
(1 + EXTRACT(EPOCH FROM AGE(p.creation_time)) / (3600 * 24))
DESC;
$$ LANGUAGE sql STABLE;
SELECT * FROM search_posts('alt chans') s
SELECT * FROM search_posts('archive') s
JOIN threads ON threads.thread_id = s.thread_id
JOIN boards ON boards.board_id = threads.board_id;
@ -641,3 +648,8 @@ SET body_search_index = (
)
WHERE board_post_id = 476524;
SELECT count(*)
FROM posts
WHERE creation_time >= CURRENT_DATE AT TIME ZONE 'UTC'
AND creation_time < (CURRENT_DATE + INTERVAL '1 day') AT TIME ZONE 'UTC';

View File

@ -23,7 +23,7 @@ AS $$
$$;
SELECT * FROM fetch_top_threads(NOW() - INTERVAL '10d');
SELECT * FROM fetch_top_threads(NOW(), 1000);
SELECT * FROM fetch_top_threads(NOW() - INTERVAL '10d') top JOIN posts ON top.thread_id = posts.thread_id;
@ -197,7 +197,7 @@ $$ LANGUAGE sql;
SELECT * FROM fetch_catalog(NOW() - INTERVAL '1y', 1001);
SELECT * FROM fetch_catalog(NOW(), 5000);
SELECT * FROM fetch_catalog(NOW(), 1000);
SELECT count(*) FROM posts;

28
sql/fix_search_posts.sql Normal file
View File

@ -0,0 +1,28 @@
BEGIN TRANSACTION;
DROP FUNCTION IF EXISTS search_posts;
CREATE OR REPLACE FUNCTION search_posts(search_text text)
RETURNS TABLE (
post posts,
pathpart text,
board_thread_id bigint,
relevance double precision
) AS $$
WITH query AS (
SELECT websearch_to_tsquery('english', search_text) AS query
)
SELECT p::posts AS post, pathpart, board_thread_id,
ts_rank(p.body_search_index, query.query)
/ (1 + EXTRACT(EPOCH FROM AGE(p.creation_time)) / (3600 * 24)) AS relevance
FROM posts p JOIN threads ON threads.thread_id = p.thread_id JOIN boards ON boards.board_id = threads.board_id, query
WHERE p.body_search_index @@ query.query
ORDER BY relevance
DESC
$$ LANGUAGE sql STABLE;
GRANT EXECUTE ON FUNCTION search_posts TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION search_posts TO chan_archiver;
REVOKE EXECUTE ON FUNCTION search_posts FROM PUBLIC;
ROLLBACK;

View File

@ -320,13 +320,23 @@ $$ LANGUAGE sql;
-- Example:
-- -- To search for posts related to 'quantum computing':
-- SELECT * FROM search_posts('quantum computing');
CREATE OR REPLACE FUNCTION search_posts(search_text TEXT)
RETURNS SETOF posts
AS $$
SELECT p.*
FROM posts p
WHERE p.body_search_index @@ websearch_to_tsquery('english', search_text)
ORDER BY ts_rank(p.body_search_index, websearch_to_tsquery('english', search_text)) DESC;
CREATE OR REPLACE FUNCTION search_posts(search_text text)
RETURNS TABLE (
post posts,
pathpart text,
board_thread_id bigint,
relevance double precision
) AS $$
WITH query AS (
SELECT websearch_to_tsquery('english', search_text) AS query
)
SELECT p::posts AS post, pathpart, board_thread_id,
ts_rank(p.body_search_index, query.query)
/ (1 + EXTRACT(EPOCH FROM AGE(p.creation_time)) / (3600 * 24)) AS relevance
FROM posts p JOIN threads ON threads.thread_id = p.thread_id JOIN boards ON boards.board_id = threads.board_id, query
WHERE p.body_search_index @@ query.query
ORDER BY relevance
DESC
$$ LANGUAGE sql STABLE;

View File

@ -0,0 +1,40 @@
BEGIN TRANSACTION;
DROP TYPE IF EXISTS dimension CASCADE;
DROP TABLE IF EXISTS attachments CASCADE;
CREATE TYPE dimension AS
( width int
, height int
);
CREATE TABLE IF NOT EXISTS attachments
( attachment_id bigserial primary key
, mimetype text NOT NULL
, creation_time timestamp with time zone NOT NULL
, sha256_hash text NOT NULL
, phash bigint
, illegal boolean NOT NULL DEFAULT false
, post_id bigint NOT NULL
, resolution dimension
, file_extension text
, thumb_extension text
, original_filename text
, board_filename text NOT NULL
, spoiler boolean NOT NULL DEFAULT true
, file_size_bytes int
, CONSTRAINT post_fk FOREIGN KEY (post_id) REFERENCES posts (post_id) ON DELETE CASCADE
);
CREATE INDEX attachments_creation_time_idx ON attachments (creation_time);
CREATE INDEX attachments_post_id_idx ON attachments (post_id);
CREATE INDEX attachments_sha256_hash_idx ON attachments (sha256_hash);
--
-- Index using the bktree extension for quickly getting the closest phashes
CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops);
GRANT SELECT ON attachments TO chan_archive_anon;
GRANT ALL ON attachments TO chan_archiver;
GRANT usage, select ON SEQUENCE attachments_attachment_id_seq TO chan_archiver;
COMMIT;

View File

@ -0,0 +1,87 @@
BEGIN TRANSACTION;
DROP FUNCTION IF EXISTS fetch_catalog;
CREATE OR REPLACE FUNCTION fetch_catalog(max_time timestamptz, max_row_read int DEFAULT 10000)
RETURNS TABLE (
-- post_count bigint,
estimated_post_count bigint,
post_id bigint,
board_post_id bigint,
creation_time timestamptz,
bump_time timestamptz,
body text,
subject text,
thread_id bigint,
board_thread_id bigint,
pathpart text,
site_name text,
file_mimetype text,
file_illegal boolean,
-- file_resolution dimension,
file_name text,
file_extension text,
file_thumb_extension text
) AS $$
WITH
top AS
(
SELECT * FROM fetch_top_threads(max_time, max_row_read) AS top
),
tall_posts AS
(
SELECT
top.post_count as estimated_post_count,
posts.post_id,
posts.board_post_id,
posts.creation_time,
top.bump_time,
posts.body,
posts.subject,
posts.thread_id
FROM top
JOIN posts ON top.thread_id = posts.thread_id
WHERE creation_time < max_time
),
op_posts AS
(
SELECT DISTINCT ON (t.thread_id)
*
FROM tall_posts t
ORDER BY t.thread_id, t.board_post_id
)-- ,
-- post_counts AS
-- (
-- SELECT thread_id, count(*) as post_count FROM
-- tall_posts
-- GROUP BY thread_id
-- )
SELECT
-- post_counts.post_count,
op_posts.*,
threads.board_thread_id, -- this should be part of the url path when creating links, not thread_id (that's internal)
boards.pathpart,
sites."name",
-- sites.site_id,
attachments.mimetype as file_mimetype,
attachments.illegal as file_illegal,
-- attachments.resolution as file_resolution,
attachments.board_filename as file_name,
attachments.file_extension,
attachments.thumb_extension as file_thumb_extension
FROM op_posts
-- JOIN post_counts ON op_posts.thread_id = post_counts.thread_id
JOIN threads ON op_posts.thread_id = threads.thread_id
JOIN boards ON threads.board_id = boards.board_id
JOIN sites ON sites.site_id = boards.site_id
LEFT OUTER JOIN attachments ON attachments.post_id = op_posts.post_id
ORDER BY bump_time DESC;
$$ LANGUAGE sql;
REVOKE EXECUTE ON FUNCTION fetch_catalog FROM PUBLIC;
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archiver;
-- ROLLBACK;
COMMIT;