408 lines
16 KiB
PL/PgSQL
408 lines
16 KiB
PL/PgSQL
BEGIN TRANSACTION;
|
|
|
|
-- <bktree/sql/init.sql>
|
|
-- CREATE EXTENSION bktree; -- only superuser can
|
|
|
|
-- Check whether any of our opclasses fail amvalidate
|
|
SELECT amname, opcname
|
|
FROM pg_opclass opc LEFT JOIN pg_am am ON am.oid = opcmethod
|
|
WHERE opc.oid >= 16384 AND NOT amvalidate(opc.oid);
|
|
|
|
-- </bktree/sql/init.sql>
|
|
|
|
DROP TYPE IF EXISTS dimension CASCADE;
|
|
DROP TABLE IF EXISTS sites CASCADE;
|
|
DROP TABLE IF EXISTS boards CASCADE;
|
|
DROP TABLE IF EXISTS threads CASCADE;
|
|
DROP TABLE IF EXISTS posts CASCADE;
|
|
DROP TABLE IF EXISTS attachments CASCADE;
|
|
DROP TYPE IF EXISTS catalog_grid_result CASCADE;
|
|
DROP TYPE IF EXISTS post_key CASCADE;
|
|
DROP FUNCTION IF EXISTS update_post_body_search_index;
|
|
DROP FUNCTION IF EXISTS fetch_top_threads;
|
|
DROP FUNCTION IF EXISTS fetch_catalog;
|
|
|
|
|
|
-- It won't let us drop roles otherwise and the IFs are to keep this script idempotent.
|
|
DO
|
|
$$BEGIN
|
|
IF EXISTS (SELECT FROM pg_roles WHERE rolname = 'chan_archiver') THEN
|
|
EXECUTE 'REVOKE ALL PRIVILEGES ON DATABASE chan_archives FROM chan_archiver';
|
|
END IF;
|
|
IF EXISTS (SELECT FROM pg_roles WHERE rolname = 'chan_archive_anon') THEN
|
|
EXECUTE 'REVOKE ALL PRIVILEGES ON DATABASE chan_archives FROM chan_archive_anon';
|
|
END IF;
|
|
END$$;
|
|
|
|
DROP ROLE IF EXISTS chan_archiver;
|
|
DROP ROLE IF EXISTS chan_archive_anon;
|
|
|
|
CREATE TABLE IF NOT EXISTS sites
|
|
( site_id serial primary key
|
|
, name text NOT NULL UNIQUE
|
|
, url text NOT NULL
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS boards
|
|
( board_id serial primary key
|
|
, name text
|
|
, pathpart text NOT NULL -- if it's /a/ then the pathpart is a
|
|
, site_id int NOT NULL
|
|
, CONSTRAINT site_fk FOREIGN KEY (site_id) REFERENCES sites (site_id) ON DELETE CASCADE
|
|
, CONSTRAINT unique_site_board_id_constraint UNIQUE (site_id, pathpart)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS threads
|
|
( thread_id bigserial primary key
|
|
, board_thread_id bigint NOT NULL -- this is the id of the thread in lainchan, mysql
|
|
, creation_time timestamp with time zone NOT NULL
|
|
, board_id int NOT NULL
|
|
, CONSTRAINT board_fk FOREIGN KEY (board_id) REFERENCES boards (board_id) ON DELETE CASCADE
|
|
, CONSTRAINT unique_board_board_thread_id_constraint UNIQUE (board_id, board_thread_id)
|
|
);
|
|
CREATE INDEX threads_creation_time_idx ON threads (creation_time);
|
|
CREATE INDEX threads_board_id_idx ON threads (board_id);
|
|
CREATE INDEX threads_board_thread_id_idx ON threads (board_thread_id);
|
|
|
|
CREATE TABLE IF NOT EXISTS posts
|
|
( post_id bigserial primary key
|
|
, board_post_id bigint NOT NULL
|
|
, creation_time timestamp with time zone NOT NULL
|
|
, body text
|
|
, subject text
|
|
, name text
|
|
, email text
|
|
, body_search_index tsvector
|
|
, thread_id bigint NOT NULL
|
|
, embed text
|
|
, local_idx int NOT NULL
|
|
, CONSTRAINT unique_thread_board_id_constraint UNIQUE (thread_id, board_post_id)
|
|
, CONSTRAINT thread_fk FOREIGN KEY (thread_id) REFERENCES threads (thread_id) ON DELETE CASCADE
|
|
, CONSTRAINT unique_thread_local_idx UNIQUE (thread_id, local_idx)
|
|
);
|
|
CREATE INDEX posts_creation_time_idx ON posts (creation_time);
|
|
CREATE INDEX posts_body_search_idx ON posts USING GIN (body_search_index);
|
|
CREATE INDEX posts_thread_id_idx ON posts (thread_id);
|
|
CREATE INDEX posts_board_post_id_idx ON posts (board_post_id);
|
|
CREATE INDEX posts_thread_id_creation_time_idx ON posts (creation_time, thread_id);
|
|
CREATE INDEX posts_local_idx_idx ON posts (local_idx);
|
|
--CREATE INDEX posts_thread_id_board_post_id_idx ON posts (thread_id, board_post_id);
|
|
|
|
CREATE OR REPLACE FUNCTION update_post_body_search_index() RETURNS trigger AS $$
|
|
BEGIN
|
|
NEW.body_search_index :=
|
|
(
|
|
setweight(to_tsvector('english', COALESCE(NEW.subject, '')), 'A') ||
|
|
setweight(to_tsvector('english', COALESCE(NEW.name, '')), 'B') ||
|
|
setweight(to_tsvector('english', COALESCE(NEW.body, '')), 'C')
|
|
);
|
|
RETURN NEW;
|
|
END;
|
|
$$ LANGUAGE plpgsql;
|
|
|
|
-- WARNING: maybe disable this before doing full table operations on the posts table,
|
|
-- like populating a new column, since this will cause it to rebuild the entire text search index
|
|
CREATE TRIGGER trigger_update_post_body_search_index
|
|
BEFORE INSERT OR UPDATE
|
|
ON posts
|
|
FOR EACH ROW
|
|
EXECUTE FUNCTION update_post_body_search_index();
|
|
|
|
CREATE TYPE dimension AS
|
|
( width int
|
|
, height int
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS attachments
|
|
( attachment_id bigserial primary key
|
|
, mimetype text NOT NULL
|
|
, creation_time timestamp with time zone NOT NULL
|
|
, sha256_hash text NOT NULL
|
|
, phash bigint
|
|
, illegal boolean NOT NULL DEFAULT false
|
|
, post_id bigint NOT NULL
|
|
, resolution dimension
|
|
, file_extension text
|
|
, thumb_extension text
|
|
, original_filename text
|
|
, board_filename text NOT NULL
|
|
, spoiler boolean NOT NULL DEFAULT true
|
|
, file_size_bytes int
|
|
, attachment_idx int NOT NULL
|
|
, CONSTRAINT post_fk FOREIGN KEY (post_id) REFERENCES posts (post_id) ON DELETE CASCADE
|
|
, CONSTRAINT unique_post_attachment_idx UNIQUE (post_id, attachment_idx)
|
|
);
|
|
CREATE INDEX attachments_creation_time_idx ON attachments (creation_time);
|
|
CREATE INDEX attachments_post_id_idx ON attachments (post_id);
|
|
CREATE INDEX attachments_sha256_hash_idx ON attachments (sha256_hash);
|
|
CREATE INDEX attachments_attachment_idx_idx ON attachments (attachment_idx);
|
|
|
|
|
|
-- Index using the bktree extension for quickly getting the closest phashes
|
|
CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops);
|
|
|
|
|
|
/*
|
|
* Function Definitions
|
|
*/
|
|
|
|
/*
|
|
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
|
|
RETURNS TABLE (post_id bigint, board_post_id bigint) AS $$
|
|
WITH inserted AS (
|
|
INSERT INTO posts (board_post_id, creation_time, body, thread_id)
|
|
SELECT np.board_post_id, np.creation_time, np.body, np.thread_id
|
|
FROM unnest(new_posts) AS np
|
|
ON CONFLICT (thread_id, board_post_id) DO NOTHING
|
|
RETURNING post_id, board_post_id
|
|
),
|
|
selected AS (
|
|
SELECT post_id, board_post_id
|
|
FROM posts
|
|
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
|
|
)
|
|
SELECT * FROM inserted
|
|
UNION ALL
|
|
SELECT * FROM selected WHERE (post_id, board_post_id) NOT IN (SELECT post_id, board_post_id FROM inserted);
|
|
$$ LANGUAGE sql;
|
|
|
|
-- 3m37s for clean db
|
|
-- 1m34s for full db (nothing inserted)
|
|
|
|
*/
|
|
|
|
|
|
-- Deprecated: this doesn't insert local_idx, seems better to just get all the posts in application
|
|
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
|
|
RETURNS TABLE (post_id bigint, board_post_id bigint, thread_id bigint) AS $$
|
|
WITH
|
|
selected AS (
|
|
SELECT post_id, board_post_id, thread_id
|
|
FROM posts
|
|
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
|
|
),
|
|
to_insert AS (
|
|
SELECT np.*
|
|
FROM unnest(new_posts) AS np
|
|
LEFT OUTER JOIN selected s ON np.thread_id = s.thread_id AND np.board_post_id = s.board_post_id
|
|
WHERE s.post_id IS NULL
|
|
),
|
|
inserted AS (
|
|
INSERT INTO posts (board_post_id, creation_time, body, subject, name, email, thread_id)
|
|
SELECT board_post_id, creation_time, body, subject, name, email, thread_id
|
|
FROM to_insert
|
|
RETURNING post_id, board_post_id, thread_id
|
|
)
|
|
SELECT * FROM inserted
|
|
UNION ALL
|
|
SELECT * FROM selected;
|
|
$$ LANGUAGE sql;
|
|
|
|
-- 1:51 for clean db (this varies a lot)
|
|
-- 1:21 for full db (nothing inserted)
|
|
|
|
|
|
CREATE TYPE post_key AS
|
|
( thread_id bigint
|
|
, board_post_id bigint
|
|
);
|
|
|
|
CREATE OR REPLACE FUNCTION get_posts(board_posts post_key[])
|
|
RETURNS SETOF posts AS $$
|
|
|
|
SELECT *
|
|
FROM posts
|
|
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(board_posts))
|
|
|
|
$$ LANGUAGE sql;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION fetch_top_threads(
|
|
p_start_time TIMESTAMPTZ,
|
|
lookback INT DEFAULT 10000
|
|
)
|
|
RETURNS TABLE(bump_time TIMESTAMPTZ, post_count BIGINT, thread_id BIGINT, where_to_leave_off TIMESTAMPTZ)
|
|
LANGUAGE sql
|
|
AS $$
|
|
SELECT
|
|
max(creation_time) as bump_time,
|
|
count(*),
|
|
thread_id,
|
|
min(creation_time) as where_to_leave_off
|
|
FROM
|
|
(
|
|
SELECT thread_id, creation_time
|
|
FROM posts
|
|
WHERE creation_time < p_start_time
|
|
ORDER BY creation_time DESC
|
|
LIMIT LEAST(lookback, 250000) -- capping the lookback to 250k
|
|
) as t
|
|
GROUP BY thread_id
|
|
ORDER BY bump_time DESC;
|
|
$$;
|
|
|
|
|
|
CREATE TYPE catalog_grid_result AS
|
|
(
|
|
-- post_count bigint,
|
|
estimated_post_count bigint,
|
|
post_id bigint,
|
|
board_post_id bigint,
|
|
creation_time timestamptz,
|
|
bump_time timestamptz,
|
|
body text,
|
|
subject text,
|
|
thread_id bigint,
|
|
embed text,
|
|
board_thread_id bigint,
|
|
pathpart text,
|
|
site_name text,
|
|
file_mimetype text,
|
|
file_illegal boolean,
|
|
-- file_resolution dimension,
|
|
file_name text,
|
|
file_extension text,
|
|
file_thumb_extension text
|
|
);
|
|
|
|
|
|
-- Function: search_posts
|
|
--
|
|
-- This function performs a full-text search on the `posts` table using PostgreSQL's text search features.
|
|
-- It takes a single argument `search_text`, which represents the text to search for within the `posts` body content.
|
|
-- The function is designed to join the `posts` table with `threads`, `boards`, `sites`, and `attachments` tables to enrich
|
|
-- the search results with additional context such as thread information, board pathpart, site name, and attachment details.
|
|
-- Results are ranked by their relevance to the search text, with the most relevant posts appearing first.
|
|
--
|
|
-- The function uses the `websearch_to_tsquery` function for parsing the provided search text into a tsquery object
|
|
-- using the 'english' configuration. It then ranks the results using the `ts_rank` function based on the match
|
|
-- between the `body_search_index` column and the tsquery object, adjusting the rank by the age of the post to prefer newer posts.
|
|
-- Results are ordered from most to least relevant based on this computed relevance.
|
|
--
|
|
-- Note that the `relevance` score is computed for ranking purposes but is not included in the function's return set.
|
|
-- The function returns a rich set of information for each matching post, including details from related tables to provide
|
|
-- a comprehensive view suitable for displaying a catalog grid of search results.
|
|
--
|
|
-- Parameters:
|
|
-- - search_text: TEXT, the text to be searched in the `posts` body.
|
|
--
|
|
-- Returns:
|
|
-- - A SETOF rows combining data from `posts`, `threads`, `boards`, `sites`, and `attachments` tables, ordered by
|
|
-- the relevance of the full-text search. Each row includes post details along with thread, board, site, and attachment
|
|
-- information to provide a full context for each search result.
|
|
--
|
|
-- Usage:
|
|
-- SELECT * FROM search_posts('Desired search text');
|
|
--
|
|
-- The function is marked as STABLE, indicating that it does not modify the database and always returns the same
|
|
-- results for the same input when the underlying data does not change.
|
|
--
|
|
-- Example:
|
|
-- -- To search for posts related to 'quantum computing':
|
|
-- SELECT * FROM search_posts('quantum computing');
|
|
|
|
CREATE OR REPLACE FUNCTION search_posts(search_text text, max_rows integer DEFAULT 1000)
|
|
RETURNS SETOF catalog_grid_result AS $$
|
|
WITH
|
|
query AS (
|
|
SELECT websearch_to_tsquery('english', search_text) AS query
|
|
),
|
|
result_set AS (
|
|
SELECT
|
|
p.*,
|
|
threads.board_thread_id,
|
|
pathpart,
|
|
sites.name AS site_name,
|
|
attachments.mimetype as file_mimetype,
|
|
attachments.illegal as file_illegal,
|
|
-- attachments.resolution as file_resolution,
|
|
attachments.board_filename as file_name,
|
|
attachments.file_extension,
|
|
attachments.thumb_extension as file_thumb_extension,
|
|
ts_rank(p.body_search_index, query.query)
|
|
/ (1 + EXTRACT(EPOCH FROM AGE(p.creation_time)) / (3600 * 24)) AS relevance
|
|
FROM posts p
|
|
JOIN threads ON threads.thread_id = p.thread_id
|
|
JOIN boards ON boards.board_id = threads.board_id
|
|
JOIN sites ON sites.site_id = boards.site_id
|
|
LEFT OUTER JOIN attachments
|
|
ON attachments.post_id = p.post_id
|
|
AND attachments.attachment_idx = 1
|
|
, query
|
|
WHERE p.body_search_index @@ query.query
|
|
LIMIT max_rows
|
|
)
|
|
SELECT
|
|
0 AS estimated_post_count,
|
|
result_set.post_id,
|
|
result_set.board_post_id,
|
|
result_set.creation_time,
|
|
result_set.creation_time AS bump_time,
|
|
result_set.body,
|
|
result_set.subject,
|
|
result_set.thread_id,
|
|
result_set.embed,
|
|
result_set.board_thread_id,
|
|
result_set.pathpart,
|
|
result_set.site_name,
|
|
result_set.file_mimetype,
|
|
result_set.file_illegal,
|
|
result_set.file_name,
|
|
result_set.file_extension,
|
|
result_set.file_thumb_extension
|
|
FROM result_set
|
|
ORDER BY result_set.relevance DESC;
|
|
$$ LANGUAGE sql STABLE;
|
|
|
|
|
|
/*
|
|
* Permissions
|
|
*/
|
|
REVOKE EXECUTE ON FUNCTION insert_posts_and_return_ids FROM PUBLIC;
|
|
REVOKE EXECUTE ON FUNCTION fetch_top_threads FROM PUBLIC;
|
|
REVOKE EXECUTE ON FUNCTION fetch_catalog FROM PUBLIC;
|
|
REVOKE EXECUTE ON FUNCTION search_posts FROM PUBLIC;
|
|
REVOKE EXECUTE ON FUNCTION update_post_body_search_index FROM PUBLIC;
|
|
REVOKE EXECUTE ON FUNCTION get_posts FROM PUBLIC;
|
|
|
|
CREATE ROLE chan_archive_anon nologin;
|
|
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
|
|
GRANT SELECT ON sites TO chan_archive_anon;
|
|
GRANT SELECT ON boards TO chan_archive_anon;
|
|
GRANT SELECT ON threads TO chan_archive_anon;
|
|
GRANT SELECT ON posts TO chan_archive_anon;
|
|
GRANT SELECT ON attachments TO chan_archive_anon;
|
|
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archive_anon;
|
|
GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archive_anon;
|
|
GRANT EXECUTE ON FUNCTION search_posts TO chan_archive_anon;
|
|
GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon;
|
|
|
|
-- GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archive_anon;
|
|
-- GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archive_anon;
|
|
GRANT chan_archive_anon TO admin;
|
|
|
|
CREATE ROLE chan_archiver noinherit login password 'test_password'
|
|
SET pgrst.db_aggregates_enabled = 'true';
|
|
GRANT CONNECT ON DATABASE chan_archives TO chan_archiver;
|
|
GRANT chan_archive_anon TO chan_archiver;
|
|
GRANT ALL ON sites TO chan_archiver;
|
|
GRANT ALL ON boards TO chan_archiver;
|
|
GRANT ALL ON threads TO chan_archiver;
|
|
GRANT ALL ON posts TO chan_archiver;
|
|
GRANT ALL ON attachments TO chan_archiver;
|
|
GRANT EXECUTE ON FUNCTION update_post_body_search_index TO chan_archiver;
|
|
GRANT EXECUTE ON FUNCTION insert_posts_and_return_ids TO chan_archiver;
|
|
GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archiver;
|
|
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archiver;
|
|
GRANT EXECUTE ON FUNCTION search_posts TO chan_archiver;
|
|
GRANT EXECUTE ON FUNCTION get_posts TO chan_archiver;
|
|
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
|
|
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
|
|
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
|
|
GRANT usage, select ON SEQUENCE posts_post_id_seq TO chan_archiver;
|
|
GRANT usage, select ON SEQUENCE attachments_attachment_id_seq TO chan_archiver;
|
|
|
|
GRANT chan_archiver TO admin;
|
|
|
|
COMMIT;
|