Compare commits

...

3 Commits

Author SHA1 Message Date
towards-a-new-leftypol 1113539321 Sync starts by getting latest post for each board 2025-01-30 18:00:42 -05:00
towards-a-new-leftypol 518467c7eb Add sql function to get the last post on each board 2025-01-29 22:50:28 -05:00
towards-a-new-leftypol 7bf61c0dd2 fetch_catalog should be in sql/initialize.sql
- copy it from the experimental remake_fetch_catalog.sql script
2025-01-29 18:27:57 -05:00
9 changed files with 252 additions and 36 deletions

View File

@ -80,6 +80,7 @@ executable chan-delorean
Data.WordUtil Data.WordUtil
Network.DataClient Network.DataClient
Network.DataClientTypes Network.DataClientTypes
Network.GetLatestPostsPerBoardResponse
Common.Server.JSONSettings Common.Server.JSONSettings
Common.Server.ConsumerSettings Common.Server.ConsumerSettings

View File

@ -1,13 +1,17 @@
{ {
"websites": { "websites": [
{
"name": "example", "name": "example",
"root_url": "https://example.net", "root_url": "https://example.net",
"boards": [ "boards": [
"tech", "tech",
"meta" "meta"
] ]
}, }
],
"postgrest_url": "http://localhost:3000", "postgrest_url": "http://localhost:3000",
"jwt": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJyb2xlIjoiY2hhbl9hcmNoaXZlciJ9.rGIKZokTDKTuQLIv8138bUby5PELfDipYYIDpJzH02c", "jwt": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJyb2xlIjoiY2hhbl9hcmNoaXZlciJ9.rGIKZokTDKTuQLIv8138bUby5PELfDipYYIDpJzH02c",
"media_root_path": "/home/phil/linixy/tmp/chan_archive_media2/archive" "media_root_path": "/home/phil/linixy/tmp/chan_archive_media_repaired/archive",
"http_fill_all": false,
"http_sync_continously": true
} }

View File

@ -222,3 +222,99 @@ SELECT * FROM boards;
SELECT * FROM threads WHERE thread_id = 11314; SELECT * FROM threads WHERE thread_id = 11314;
ANALYZE posts; ANALYZE posts;
-- Scratch queries for inspecting the attachments table and its id sequence.
SELECT count(*) from attachments;
SELECT * FROM attachments WHERE post_id = 253383;
SELECT * from attachments WHERE board_filename = '1722466065515';
-- Count the attachments whose id precedes that of the file looked up above.
-- FROM is required: without it `attachments` is read as an alias for count(*)
-- and attachment_id cannot be resolved.
SELECT count(*) FROM attachments WHERE attachment_id < (SELECT attachment_id FROM attachments WHERE board_filename = '1722466065515');
SELECT max(attachment_id) FROM attachments a;
SELECT pg_get_serial_sequence('attachments', 'attachment_id');
-- Reset the attachment_id sequence to a fixed value.
-- NOTE(review): COALESCE over a literal never falls back to 1; presumably 198853
-- was pasted in from the max(attachment_id) query above -- confirm before re-running.
SELECT setval(pg_get_serial_sequence('attachments', 'attachment_id'), COALESCE(198853, 1), true);
-- One-off data repair: set thumb_extension to 'png' for every attachment that
-- currently has thumb_extension = 'jpg' and belongs (via posts -> threads ->
-- boards -> sites) to the site named 'leftychan'.
UPDATE attachments SET thumb_extension = 'png'
WHERE
attachment_id IN
(
-- Ids of the affected attachments, resolved through the ownership chain.
SELECT a.attachment_id
FROM attachments a
JOIN posts p ON a.post_id = p.post_id
JOIN threads t ON p.thread_id = t.thread_id
JOIN boards b ON t.board_id = b.board_id
JOIN sites s ON b.site_id = s.site_id
WHERE s.name = 'leftychan'
AND a.thumb_extension = 'jpg'
);
-- Spot checks on individual rows.
SELECT * FROM posts WHERE board_post_id = 129;
SELECT * FROM attachments WHERE post_id = 461287;
-- Count attachments of site 'leftychan' that still have thumb_extension = 'jpg'
-- (same join chain and filter as the UPDATE above).
SELECT count(a.*)
FROM attachments a
JOIN posts p ON a.post_id = p.post_id
JOIN threads t ON p.thread_id = t.thread_id
JOIN boards b ON t.board_id = b.board_id
JOIN sites s ON b.site_id = s.site_id
WHERE s.name = 'leftychan'
AND a.thumb_extension = 'jpg';
-- Most recent post (by creation_time) on the board with pathpart 'leftypol' of site_id 1.
SELECT * FROM posts
JOIN threads ON threads.thread_id = posts.thread_id
JOIN boards ON boards.board_id = threads.board_id
WHERE boards.pathpart = 'leftypol'
AND boards.site_id = 1
ORDER BY posts.creation_time DESC
LIMIT 1;
-- Most recent post across all boards.
SELECT * FROM posts
ORDER BY posts.creation_time DESC
LIMIT 1;
-- List every board together with the name of its site.
SELECT boards.board_id, boards.pathpart, sites.name FROM boards JOIN sites ON sites.site_id = boards.site_id;
-- Prototype query: one row per board holding that board's most recent post.
-- DISTINCT ON (b.board_id) keeps the first row of each board group, and the
-- ORDER BY puts the post with the greatest creation_time first in each group.
-- Boards without any thread or post produce no row (inner joins).
-- Unlike the function defined below, this variant also selects p.body.
SELECT DISTINCT ON (b.board_id)
b.board_id,
b.site_id,
b.pathpart,
p.post_id,
p.board_post_id,
p.creation_time,
p.body,
t.thread_id,
t.board_thread_id
FROM boards b
JOIN threads t ON t.board_id = b.board_id
JOIN posts p ON p.thread_id = t.thread_id
ORDER BY b.board_id, p.creation_time DESC;
-- Function: get_latest_posts_per_board
--
-- Returns one row per board: the board's identifiers (board_id, site_id,
-- pathpart) together with its most recent post (post_id, board_post_id,
-- creation_time) and that post's thread (thread_id, board_thread_id).
--
-- Boards that have no thread or no post yield no row, because the joins are
-- inner joins.
--
-- DISTINCT ON keeps the first row of each board group, so the ORDER BY decides
-- which post wins. board_post_id is included as a tie-break so that two posts
-- sharing the same creation_time always resolve to the same row (the one with
-- the higher board_post_id) instead of an arbitrary one.
CREATE OR REPLACE FUNCTION get_latest_posts_per_board()
RETURNS TABLE (
board_id int,
site_id int,
pathpart text,
post_id bigint,
board_post_id bigint,
creation_time timestamp with time zone,
thread_id bigint,
board_thread_id bigint
) AS $$
SELECT DISTINCT ON (b.board_id)
b.board_id,
b.site_id,
b.pathpart,
p.post_id,
p.board_post_id,
p.creation_time,
t.thread_id,
t.board_thread_id
FROM boards b
JOIN threads t ON t.board_id = b.board_id
JOIN posts p ON p.thread_id = t.thread_id
ORDER BY b.board_id, p.creation_time DESC, p.board_post_id DESC;
$$ LANGUAGE sql STABLE;
SELECT * FROM get_latest_posts_per_board();

View File

@ -21,6 +21,7 @@ DROP TYPE IF EXISTS post_key CASCADE;
DROP FUNCTION IF EXISTS update_post_body_search_index; DROP FUNCTION IF EXISTS update_post_body_search_index;
DROP FUNCTION IF EXISTS fetch_top_threads; DROP FUNCTION IF EXISTS fetch_top_threads;
DROP FUNCTION IF EXISTS fetch_catalog; DROP FUNCTION IF EXISTS fetch_catalog;
DROP FUNCTION IF EXISTS get_latest_posts_per_board;
-- It won't let us drop roles otherwise and the IFs are to keep this script idempotent. -- It won't let us drop roles otherwise and the IFs are to keep this script idempotent.
@ -222,7 +223,7 @@ CREATE OR REPLACE FUNCTION fetch_top_threads(
lookback INT DEFAULT 10000 lookback INT DEFAULT 10000
) )
RETURNS TABLE(bump_time TIMESTAMPTZ, post_count BIGINT, thread_id BIGINT, where_to_leave_off TIMESTAMPTZ) RETURNS TABLE(bump_time TIMESTAMPTZ, post_count BIGINT, thread_id BIGINT, where_to_leave_off TIMESTAMPTZ)
LANGUAGE sql LANGUAGE sql STABLE
AS $$ AS $$
SELECT SELECT
max(creation_time) as bump_time, max(creation_time) as bump_time,
@ -266,6 +267,51 @@ CREATE TYPE catalog_grid_result AS
); );
-- Function: fetch_catalog
--
-- Returns one catalog_grid_result row per thread selected by
-- fetch_top_threads(max_time, max_row_read), ordered by bump_time descending.
--
-- Parameters:
--   max_time     - passed to fetch_top_threads and also used as an exclusive
--                  upper bound on the creation_time of the joined post.
--   max_row_read - passed to fetch_top_threads as its second argument
--                  (defaults to 10000).
--
-- Each thread is joined to its post with local_idx = 1 (presumably the opening
-- post -- confirm) and, when present, to that post's attachment with
-- attachment_idx = 1; posts without such an attachment still produce a row
-- with NULL file_* columns (LEFT OUTER JOIN).
CREATE OR REPLACE FUNCTION fetch_catalog(max_time timestamptz, max_row_read int DEFAULT 10000)
RETURNS SETOF catalog_grid_result AS $$
WITH
-- Threads chosen by fetch_top_threads, with their bump_time and post_count.
top AS
(
SELECT * FROM fetch_top_threads(max_time, max_row_read) AS top
),
-- The local_idx = 1 post of each selected thread, carrying the thread's
-- post_count (as estimated_post_count) and bump_time along with it.
tall_posts AS
(
SELECT
top.post_count AS estimated_post_count,
posts.post_id,
posts.board_post_id,
posts.creation_time,
top.bump_time,
posts.body,
posts.subject,
posts.thread_id,
posts.embed
FROM top
JOIN posts ON top.thread_id = posts.thread_id AND posts.local_idx = 1
WHERE creation_time < max_time
)
-- Decorate each post with its thread, board, site and first-attachment details.
SELECT
-- post_counts.post_count,
tall_posts.*,
threads.board_thread_id, -- this should be part of the url path when creating links, not thread_id (that's internal)
boards.pathpart,
sites."name",
-- sites.site_id,
attachments.mimetype AS file_mimetype,
attachments.illegal AS file_illegal,
-- attachments.resolution AS file_resolution,
attachments.board_filename AS file_name,
attachments.file_extension,
attachments.thumb_extension AS file_thumb_extension
FROM tall_posts
JOIN threads ON tall_posts.thread_id = threads.thread_id
JOIN boards ON threads.board_id = boards.board_id
JOIN sites ON sites.site_id = boards.site_id
LEFT OUTER JOIN attachments ON attachments.post_id = tall_posts.post_id AND attachments.attachment_idx = 1
ORDER BY bump_time DESC;
$$ LANGUAGE sql STABLE;
-- Function: search_posts -- Function: search_posts
-- --
-- This function performs a full-text search on the `posts` table using PostgreSQL's text search features. -- This function performs a full-text search on the `posts` table using PostgreSQL's text search features.
@ -355,6 +401,33 @@ RETURNS SETOF catalog_grid_result AS $$
$$ LANGUAGE sql STABLE; $$ LANGUAGE sql STABLE;
-- Function: get_latest_posts_per_board
--
-- Returns one row per board: the board's identifiers (board_id, site_id,
-- pathpart) together with its most recent post (post_id, board_post_id,
-- creation_time) and that post's thread (thread_id, board_thread_id).
--
-- Boards that have no thread or no post yield no row, because the joins are
-- inner joins.
--
-- DISTINCT ON keeps the first row of each board group, so the ORDER BY decides
-- which post wins. board_post_id is included as a tie-break so that two posts
-- sharing the same creation_time always resolve to the same row (the one with
-- the higher board_post_id) instead of an arbitrary one.
CREATE OR REPLACE FUNCTION get_latest_posts_per_board()
RETURNS TABLE (
board_id int,
site_id int,
pathpart text,
post_id bigint,
board_post_id bigint,
creation_time timestamp with time zone,
thread_id bigint,
board_thread_id bigint
) AS $$
SELECT DISTINCT ON (b.board_id)
b.board_id,
b.site_id,
b.pathpart,
p.post_id,
p.board_post_id,
p.creation_time,
t.thread_id,
t.board_thread_id
FROM boards b
JOIN threads t ON t.board_id = b.board_id
JOIN posts p ON p.thread_id = t.thread_id
ORDER BY b.board_id, p.creation_time DESC, p.board_post_id DESC;
$$ LANGUAGE sql STABLE;
/* /*
* Permissions * Permissions
*/ */
@ -364,6 +437,7 @@ REVOKE EXECUTE ON FUNCTION fetch_catalog FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION search_posts FROM PUBLIC; REVOKE EXECUTE ON FUNCTION search_posts FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION update_post_body_search_index FROM PUBLIC; REVOKE EXECUTE ON FUNCTION update_post_body_search_index FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION get_posts FROM PUBLIC; REVOKE EXECUTE ON FUNCTION get_posts FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION get_latest_posts_per_board FROM PUBLIC;
CREATE ROLE chan_archive_anon nologin; CREATE ROLE chan_archive_anon nologin;
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon; GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
@ -376,6 +450,7 @@ GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archive_anon; GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION search_posts TO chan_archive_anon; GRANT EXECUTE ON FUNCTION search_posts TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon; GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION get_latest_posts_per_board TO chan_archive_anon;
-- GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archive_anon; -- GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archive_anon;
-- GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archive_anon; -- GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archive_anon;
@ -396,6 +471,7 @@ GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archiver;
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archiver; GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archiver;
GRANT EXECUTE ON FUNCTION search_posts TO chan_archiver; GRANT EXECUTE ON FUNCTION search_posts TO chan_archiver;
GRANT EXECUTE ON FUNCTION get_posts TO chan_archiver; GRANT EXECUTE ON FUNCTION get_posts TO chan_archiver;
GRANT EXECUTE ON FUNCTION get_latest_posts_per_board TO chan_archiver;
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;

View File

@ -1,5 +1,7 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveAnyClass #-}
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use <&>" #-}
module Network.DataClient module Network.DataClient
( HttpError(..) ( HttpError(..)
@ -19,6 +21,7 @@ module Network.DataClient
, postAttachments , postAttachments
, getJSON , getJSON
, getFile , getFile
, getLatestPostsPerBoard
) where ) where
import Control.Monad (forM) import Control.Monad (forM)
@ -49,6 +52,7 @@ import qualified Common.AttachmentType as Attachments
import qualified Common.PostsType as Posts import qualified Common.PostsType as Posts
import Common.Network.HttpClient import Common.Network.HttpClient
import qualified Network.DataClientTypes as T import qualified Network.DataClientTypes as T
import qualified Network.GetLatestPostsPerBoardResponse as GLPPBR
data PostId = PostId data PostId = PostId
@ -233,14 +237,20 @@ getFile url = do
case result of case result of
Left (err :: HttpError) -> do Left (err :: HttpError) -> do
putStrLn $ "getFile " ++ url ++ " Error!" putStrLn $ "getFile " ++ url ++ " Error!"
putStrLn $ show err print err
return Nothing return Nothing
Right lbs -> do Right lbs -> do
putStrLn $ "getFile " ++ url ++ " SUCCESS!" putStrLn $ "getFile " ++ url ++ " SUCCESS!"
tmp_root <- getCanonicalTemporaryDirectory tmp_root <- getCanonicalTemporaryDirectory
(tmp_filepath, tmp_filehandle) <- openBinaryTempFile tmp_root "chan.attachment" (tmp_filepath, tmp_filehandle) <- openBinaryTempFile tmp_root "chan.attachment"
putStrLn $ "Created " ++ tmp_filepath putStrLn $ "Created " ++ tmp_filepath
putStrLn $ "Writing attachment..." putStrLn "Writing attachment..."
LBS.hPut tmp_filehandle lbs LBS.hPut tmp_filehandle lbs
hClose tmp_filehandle hClose tmp_filehandle
return $ Just tmp_filepath return $ Just tmp_filepath
-- | Ask the data server for the latest post on every board.
--
-- Issues a 'post' request to @/rpc/get_latest_posts_per_board@ using the given
-- settings, passing 'mempty' as the third argument, and hands the outcome to
-- 'eitherDecodeResponse' to obtain a list of
-- 'GLPPBR.GetLatestPostsPerBoardResponse' records.
getLatestPostsPerBoard :: T.JSONSettings -> IO (Either HttpError [ GLPPBR.GetLatestPostsPerBoardResponse ])
getLatestPostsPerBoard settings = do
    response <- post settings "/rpc/get_latest_posts_per_board" mempty False
    return $ eitherDecodeResponse response

View File

@ -10,4 +10,3 @@ data ThreadMaxIdx = ThreadMaxIdx
{ thread_id :: Int64 { thread_id :: Int64
, max_idx :: Int , max_idx :: Int
} deriving (Show, Generic, FromJSON) } deriving (Show, Generic, FromJSON)

View File

@ -0,0 +1,20 @@
{-# LANGUAGE DeriveAnyClass #-}
module Network.GetLatestPostsPerBoardResponse
where
import Data.Int (Int64)
import Data.Time.Clock (UTCTime)
import Data.Aeson (FromJSON)
import GHC.Generics
-- | One row of the @get_latest_posts_per_board@ result: a board together with
-- its most recent post and that post's thread. The field names mirror the
-- column names of the SQL function's result table, which is what the derived
-- 'FromJSON' instance keys on.
data GetLatestPostsPerBoardResponse = GetLatestPostsPerBoardResponse
{ board_id :: Int -- ^ @boards.board_id@
, site_id :: Int -- ^ @boards.site_id@
, pathpart :: String -- ^ @boards.pathpart@
, post_id :: Maybe Int64 -- ^ @posts.post_id@. NOTE(review): the SQL function uses inner joins, so this looks never-null; confirm whether 'Maybe' is needed.
, board_post_id :: Int64 -- ^ @posts.board_post_id@
, creation_time :: UTCTime -- ^ @posts.creation_time@ (timestamp with time zone)
, thread_id :: Int64 -- ^ @threads.thread_id@
, board_thread_id :: Integer -- ^ @threads.board_thread_id@ (declared bigint in the SQL result table)
} deriving (Show, Generic, FromJSON)

View File

@ -87,7 +87,7 @@ main = do
where where
pf :: (Show a, Show b) => (a, b) -> IO () pf :: (Show a, Show b) => (a, b) -> IO ()
pf (a, b) = putStrLn $ (show a) ++ "," ++ (show b) pf (a, b) = putStrLn $ show a ++ "," ++ show b
f _ (xs, gen) = f _ (xs, gen) =
let (x, newgen) = selectSkewedIndex (size q) gen let (x, newgen) = selectSkewedIndex (size q) gen
@ -97,5 +97,5 @@ main = do
q = fromList [ Elem i undefined | i <- [1..100] ] q = fromList [ Elem i undefined | i <- [1..100] ]
countOccurrences :: (Eq a, Ord a) => [a] -> [(a, Int)] countOccurrences :: (Eq a, Ord a) => [a] -> [(a, Int)]
countOccurrences rolls = map (\x -> (head x, length x)) . group . sort $ rolls countOccurrences = map (\x -> (head x, length x)) . group . sort

View File

@ -1,21 +1,31 @@
{-# LANGUAGE RecordWildCards #-}
module Sync where module Sync where
import Common.Server.ConsumerSettings import Common.Server.ConsumerSettings as Settings
import Lib (getBoards, toClientSettings) import Common.Server.JSONSettings as JSONSettings
import SitesType (Site) import Network.DataClient (getLatestPostsPerBoard)
import BoardsType (Board)
getSiteBoards :: ConsumerJSONSettings -> JSONSiteSettings -> IO (Site, [ Board ]) consumerSettingsToPartialJSONSettings :: Settings.ConsumerJSONSettings -> JSONSettings.JSONSettings
getSiteBoards settings site_settings = consumerSettingsToPartialJSONSettings ConsumerJSONSettings {..} =
let client_settings = toClientSettings settings site_settings JSONSettings
in getBoards { JSONSettings.postgrest_url = postgrest_url
client_settings , JSONSettings.jwt = jwt
(boards site_settings) , backup_read_root = undefined
, JSONSettings.media_root_path
, site_name = undefined
, site_url = undefined
}
syncWebsites :: ConsumerJSONSettings -> IO () syncWebsites :: ConsumerJSONSettings -> IO ()
syncWebsites _ = do syncWebsites consumer_settings = do
putStrLn "Starting channel web synchronization." putStrLn "Starting channel web synchronization."
let json_settings = consumerSettingsToPartialJSONSettings consumer_settings
asdf <- getLatestPostsPerBoard json_settings
print asdf
-- first we need all the (Site, Board) tuples -- first we need all the (Site, Board) tuples
-- perhaps we even want all (Site, Board, Thread) pairs -- perhaps we even want all (Site, Board, Thread) pairs
-- But then we don't load the posts of each thread, instead only do -- But then we don't load the posts of each thread, instead only do