Compare commits

...

3 Commits

Author SHA1 Message Date
towards-a-new-leftypol 1113539321 Sync starts by getting latest post for each board 2025-01-30 18:00:42 -05:00
towards-a-new-leftypol 518467c7eb Add sql function to get the last post on each board 2025-01-29 22:50:28 -05:00
towards-a-new-leftypol 7bf61c0dd2 fetch_catalog should be in sql/initialize.sql
- copy it from the experimental remake_fetch_catalog.sql script
2025-01-29 18:27:57 -05:00
9 changed files with 252 additions and 36 deletions

View File

@ -80,6 +80,7 @@ executable chan-delorean
Data.WordUtil
Network.DataClient
Network.DataClientTypes
Network.GetLatestPostsPerBoardResponse
Common.Server.JSONSettings
Common.Server.ConsumerSettings

View File

@ -1,13 +1,17 @@
{
"websites": {
"name": "example",
"root_url": "https://example.net",
"boards": [
"tech",
"meta"
]
},
"websites": [
{
"name": "example",
"root_url": "https://example.net",
"boards": [
"tech",
"meta"
]
}
],
"postgrest_url": "http://localhost:3000",
"jwt": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJyb2xlIjoiY2hhbl9hcmNoaXZlciJ9.rGIKZokTDKTuQLIv8138bUby5PELfDipYYIDpJzH02c",
"media_root_path": "/home/phil/linixy/tmp/chan_archive_media2/archive"
"media_root_path": "/home/phil/linixy/tmp/chan_archive_media_repaired/archive",
"http_fill_all": false,
"http_sync_continously": true
}

View File

@ -222,3 +222,99 @@ SELECT * FROM boards;
SELECT * FROM threads WHERE thread_id = 11314;
ANALYZE posts;
SELECT count(*) from attachments;
-- Ad-hoc queries used while inspecting attachments and re-seeding the
-- attachment_id sequence.

-- Attachments of one post (internal post_id).
SELECT * FROM attachments WHERE post_id = 253383;

-- Locate an attachment by the filename the source board gave it.
SELECT * from attachments WHERE board_filename = '1722466065515';

-- Number of attachments stored before that one.
-- The FROM keyword was missing here: without it `attachments` is parsed as a
-- column alias and the statement fails because attachment_id cannot be resolved.
SELECT count(*) FROM attachments WHERE attachment_id < (SELECT attachment_id FROM attachments WHERE board_filename = '1722466065515');

-- Highest attachment_id currently in the table.
SELECT max(attachment_id) FROM attachments a;

-- Name of the sequence backing attachments.attachment_id.
SELECT pg_get_serial_sequence('attachments', 'attachment_id');

-- Re-seed that sequence. The value is hard-coded, so COALESCE always yields
-- 198853 here. NOTE(review): presumably the max(attachment_id) observed above.
SELECT setval(pg_get_serial_sequence('attachments', 'attachment_id'), COALESCE(198853, 1), true);
-- Set thumb_extension to 'png' for every attachment that currently has 'jpg'
-- and belongs to the site named 'leftychan'.
-- The subquery walks attachments -> posts -> threads -> boards -> sites to
-- collect the attachment_ids of the affected rows.
UPDATE attachments SET thumb_extension = 'png'
WHERE
attachment_id IN
(
SELECT a.attachment_id
FROM attachments a
JOIN posts p ON a.post_id = p.post_id
JOIN threads t ON p.thread_id = t.thread_id
JOIN boards b ON t.board_id = b.board_id
JOIN sites s ON b.site_id = s.site_id
WHERE s.name = 'leftychan'
AND a.thumb_extension = 'jpg'
);
-- Posts carrying a given board_post_id.
SELECT *
  FROM posts
 WHERE board_post_id = 129;

-- Attachments of one post (internal post_id).
SELECT *
  FROM attachments
 WHERE post_id = 461287;

-- Count of 'leftychan' attachments whose thumbnail extension is still 'jpg'.
SELECT count(a.*)
  FROM attachments a
  JOIN posts   p ON a.post_id   = p.post_id
  JOIN threads t ON p.thread_id = t.thread_id
  JOIN boards  b ON t.board_id  = b.board_id
  JOIN sites   s ON b.site_id   = s.site_id
 WHERE s.name = 'leftychan'
   AND a.thumb_extension = 'jpg';

-- Newest post on the 'leftypol' board of site 1.
SELECT *
  FROM posts
  JOIN threads ON threads.thread_id = posts.thread_id
  JOIN boards  ON boards.board_id   = threads.board_id
 WHERE boards.pathpart = 'leftypol'
   AND boards.site_id = 1
 ORDER BY posts.creation_time DESC
 LIMIT 1;

-- Newest post overall.
SELECT *
  FROM posts
 ORDER BY posts.creation_time DESC
 LIMIT 1;

-- Every board together with the name of the site it belongs to.
SELECT boards.board_id,
       boards.pathpart,
       sites.name
  FROM boards
  JOIN sites ON sites.site_id = boards.site_id;
-- Ad-hoc query: for each board, the row with the greatest creation_time,
-- including the post body.
SELECT DISTINCT ON (brd.board_id)
       brd.board_id,
       brd.site_id,
       brd.pathpart,
       pst.post_id,
       pst.board_post_id,
       pst.creation_time,
       pst.body,
       thr.thread_id,
       thr.board_thread_id
  FROM boards  AS brd
  JOIN threads AS thr ON thr.board_id  = brd.board_id
  JOIN posts   AS pst ON pst.thread_id = thr.thread_id
 ORDER BY brd.board_id, pst.creation_time DESC;
-- Function: get_latest_posts_per_board
--
-- Returns one row per board describing the post with the greatest
-- creation_time on that board, together with the thread it belongs to.
--
-- Boards without any thread or post produce no row, because boards, threads
-- and posts are combined with inner joins.
--
-- When several posts on a board share the same creation_time, the one with the
-- highest board_post_id is returned. Without that tie-breaker DISTINCT ON would
-- pick an arbitrary row among the tied posts. (Assumes board_post_id is unique
-- within a board -- TODO confirm.)
CREATE OR REPLACE FUNCTION get_latest_posts_per_board()
RETURNS TABLE (
    board_id int,
    site_id int,
    pathpart text,
    post_id bigint,
    board_post_id bigint,
    creation_time timestamp with time zone,
    thread_id bigint,
    board_thread_id bigint
) AS $$
    SELECT DISTINCT ON (b.board_id)
        b.board_id,
        b.site_id,
        b.pathpart,
        p.post_id,
        p.board_post_id,
        p.creation_time,
        t.thread_id,
        t.board_thread_id
    FROM boards b
    JOIN threads t ON t.board_id = b.board_id
    JOIN posts p ON p.thread_id = t.thread_id
    ORDER BY b.board_id, p.creation_time DESC, p.board_post_id DESC;
$$ LANGUAGE sql STABLE;

-- Smoke test for the function defined above.
SELECT * FROM get_latest_posts_per_board();

View File

@ -21,6 +21,7 @@ DROP TYPE IF EXISTS post_key CASCADE;
DROP FUNCTION IF EXISTS update_post_body_search_index;
DROP FUNCTION IF EXISTS fetch_top_threads;
DROP FUNCTION IF EXISTS fetch_catalog;
DROP FUNCTION IF EXISTS get_latest_posts_per_board;
-- It won't let us drop roles otherwise and the IFs are to keep this script idempotent.
@ -222,7 +223,7 @@ CREATE OR REPLACE FUNCTION fetch_top_threads(
lookback INT DEFAULT 10000
)
RETURNS TABLE(bump_time TIMESTAMPTZ, post_count BIGINT, thread_id BIGINT, where_to_leave_off TIMESTAMPTZ)
LANGUAGE sql
LANGUAGE sql STABLE
AS $$
SELECT
max(creation_time) as bump_time,
@ -266,6 +267,51 @@ CREATE TYPE catalog_grid_result AS
);
-- Function: fetch_catalog
--
-- Builds the catalog rows for the threads returned by
-- fetch_top_threads(max_time, max_row_read), ordered by bump_time, newest first.
--
-- Parameters:
--   max_time     - passed to fetch_top_threads and also used as an exclusive
--                  upper bound on the creation_time of the selected post.
--   max_row_read - passed through to fetch_top_threads (default 10000).
--
-- Returns SETOF catalog_grid_result.
-- NOTE(review): the selected columns presumably have to line up positionally
-- with the fields of catalog_grid_result -- confirm against the type definition
-- before reordering anything here.
CREATE OR REPLACE FUNCTION fetch_catalog(max_time timestamptz, max_row_read int DEFAULT 10000)
RETURNS SETOF catalog_grid_result AS $$
WITH
-- Candidate threads, as computed by fetch_top_threads.
top AS
(
SELECT * FROM fetch_top_threads(max_time, max_row_read) AS top
),
-- One row per candidate thread: the post with local_idx = 1
-- (presumably the opening post -- TODO confirm), plus the thread's
-- post count and bump time taken from `top`.
tall_posts AS
(
SELECT
top.post_count AS estimated_post_count,
posts.post_id,
posts.board_post_id,
posts.creation_time,
top.bump_time,
posts.body,
posts.subject,
posts.thread_id,
posts.embed
FROM top
JOIN posts ON top.thread_id = posts.thread_id AND posts.local_idx = 1
WHERE creation_time < max_time
)
SELECT
-- post_counts.post_count,
tall_posts.*,
threads.board_thread_id, -- this should be part of the url path when creating links, not thread_id (that's internal)
boards.pathpart,
sites."name",
-- sites.site_id,
attachments.mimetype AS file_mimetype,
attachments.illegal AS file_illegal,
-- attachments.resolution AS file_resolution,
attachments.board_filename AS file_name,
attachments.file_extension,
attachments.thumb_extension AS file_thumb_extension
FROM tall_posts
JOIN threads ON tall_posts.thread_id = threads.thread_id
JOIN boards ON threads.board_id = boards.board_id
JOIN sites ON sites.site_id = boards.site_id
-- LEFT OUTER JOIN keeps posts that have no attachment; only the attachment
-- with attachment_idx = 1 is used, so each post yields at most one file.
LEFT OUTER JOIN attachments ON attachments.post_id = tall_posts.post_id AND attachments.attachment_idx = 1
ORDER BY bump_time DESC;
$$ LANGUAGE sql STABLE;
-- Function: search_posts
--
-- This function performs a full-text search on the `posts` table using PostgreSQL's text search features.
@ -355,6 +401,33 @@ RETURNS SETOF catalog_grid_result AS $$
$$ LANGUAGE sql STABLE;
-- Function: get_latest_posts_per_board
--
-- Returns one row per board describing the post with the greatest
-- creation_time on that board, together with the thread it belongs to.
--
-- Boards without any thread or post produce no row, because boards, threads
-- and posts are combined with inner joins.
--
-- When several posts on a board share the same creation_time, the one with the
-- highest board_post_id is returned. Without that tie-breaker DISTINCT ON would
-- pick an arbitrary row among the tied posts. (Assumes board_post_id is unique
-- within a board -- TODO confirm.)
CREATE OR REPLACE FUNCTION get_latest_posts_per_board()
RETURNS TABLE (
    board_id int,
    site_id int,
    pathpart text,
    post_id bigint,
    board_post_id bigint,
    creation_time timestamp with time zone,
    thread_id bigint,
    board_thread_id bigint
) AS $$
    SELECT DISTINCT ON (b.board_id)
        b.board_id,
        b.site_id,
        b.pathpart,
        p.post_id,
        p.board_post_id,
        p.creation_time,
        t.thread_id,
        t.board_thread_id
    FROM boards b
    JOIN threads t ON t.board_id = b.board_id
    JOIN posts p ON p.thread_id = t.thread_id
    ORDER BY b.board_id, p.creation_time DESC, p.board_post_id DESC;
$$ LANGUAGE sql STABLE;
/*
* Permissions
*/
@ -364,18 +437,20 @@ REVOKE EXECUTE ON FUNCTION fetch_catalog FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION search_posts FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION update_post_body_search_index FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION get_posts FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION get_latest_posts_per_board FROM PUBLIC;
CREATE ROLE chan_archive_anon nologin;
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
GRANT SELECT ON sites TO chan_archive_anon;
GRANT SELECT ON boards TO chan_archive_anon;
GRANT SELECT ON threads TO chan_archive_anon;
GRANT SELECT ON posts TO chan_archive_anon;
GRANT SELECT ON attachments TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION search_posts TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon;
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
GRANT SELECT ON sites TO chan_archive_anon;
GRANT SELECT ON boards TO chan_archive_anon;
GRANT SELECT ON threads TO chan_archive_anon;
GRANT SELECT ON posts TO chan_archive_anon;
GRANT SELECT ON attachments TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION search_posts TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION get_latest_posts_per_board TO chan_archive_anon;
-- GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archive_anon;
-- GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archive_anon;
@ -396,6 +471,7 @@ GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archiver;
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archiver;
GRANT EXECUTE ON FUNCTION search_posts TO chan_archiver;
GRANT EXECUTE ON FUNCTION get_posts TO chan_archiver;
GRANT EXECUTE ON FUNCTION get_latest_posts_per_board TO chan_archiver;
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;

View File

@ -1,5 +1,7 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use <&>" #-}
module Network.DataClient
( HttpError(..)
@ -19,6 +21,7 @@ module Network.DataClient
, postAttachments
, getJSON
, getFile
, getLatestPostsPerBoard
) where
import Control.Monad (forM)
@ -49,6 +52,7 @@ import qualified Common.AttachmentType as Attachments
import qualified Common.PostsType as Posts
import Common.Network.HttpClient
import qualified Network.DataClientTypes as T
import qualified Network.GetLatestPostsPerBoardResponse as GLPPBR
data PostId = PostId
@ -233,14 +237,20 @@ getFile url = do
case result of
Left (err :: HttpError) -> do
putStrLn $ "getFile " ++ url ++ " Error!"
putStrLn $ show err
print err
return Nothing
Right lbs -> do
putStrLn $ "getFile " ++ url ++ " SUCCESS!"
tmp_root <- getCanonicalTemporaryDirectory
(tmp_filepath, tmp_filehandle) <- openBinaryTempFile tmp_root "chan.attachment"
putStrLn $ "Created " ++ tmp_filepath
putStrLn $ "Writing attachment..."
putStrLn "Writing attachment..."
LBS.hPut tmp_filehandle lbs
hClose tmp_filehandle
return $ Just tmp_filepath
-- | Fetch the most recent archived post of every board from the backend.
--
-- Calls the @\/rpc\/get_latest_posts_per_board@ endpoint through 'post' with an
-- empty payload and decodes the response body with 'eitherDecodeResponse'.
-- Transport and decoding failures are both reported as 'Left'.
--
-- NOTE(review): the meaning of the trailing 'False' argument to 'post' is not
-- visible from here -- confirm against the definition of 'post'.
getLatestPostsPerBoard :: T.JSONSettings -> IO (Either HttpError [ GLPPBR.GetLatestPostsPerBoardResponse ])
getLatestPostsPerBoard settings =
    eitherDecodeResponse <$> post settings "/rpc/get_latest_posts_per_board" mempty False

View File

@ -10,4 +10,3 @@ data ThreadMaxIdx = ThreadMaxIdx
{ thread_id :: Int64
, max_idx :: Int
} deriving (Show, Generic, FromJSON)

View File

@ -0,0 +1,20 @@
{-# LANGUAGE DeriveAnyClass #-}
module Network.GetLatestPostsPerBoardResponse
where
import Data.Int (Int64)
import Data.Time.Clock (UTCTime)
import Data.Aeson (FromJSON)
import GHC.Generics
-- | One row of the @get_latest_posts_per_board@ result.
--
-- The 'FromJSON' instance is derived generically, so the record field names
-- double as the JSON keys and must stay in sync with the column names returned
-- by the SQL function.
data GetLatestPostsPerBoardResponse = GetLatestPostsPerBoardResponse
-- Internal id of the board.
{ board_id :: Int
-- Internal id of the site the board belongs to.
, site_id :: Int
-- The board's pathpart column.
, pathpart :: String
-- Internal id of the latest post.
-- NOTE(review): this is the only 'Maybe' field; confirm whether the SQL
-- function can actually return NULL here.
, post_id :: Maybe Int64
-- The post's board_post_id column.
, board_post_id :: Int64
-- Creation time of the latest post.
, creation_time :: UTCTime
-- Internal id of the thread containing the latest post.
, thread_id :: Int64
-- The thread's board_thread_id column.
-- NOTE(review): 'Integer' here while the other bigint columns use 'Int64' -- confirm this is intended.
, board_thread_id :: Integer
} deriving (Show, Generic, FromJSON)

View File

@ -87,7 +87,7 @@ main = do
where
pf :: (Show a, Show b) => (a, b) -> IO ()
pf (a, b) = putStrLn $ (show a) ++ "," ++ (show b)
pf (a, b) = putStrLn $ show a ++ "," ++ show b
f _ (xs, gen) =
let (x, newgen) = selectSkewedIndex (size q) gen
@ -97,5 +97,5 @@ main = do
q = fromList [ Elem i undefined | i <- [1..100] ]
-- | Tally how often each distinct value occurs in the input.
--
-- The result is sorted ascending by value; an empty input yields an empty
-- list. Matching each run as @x : _@ avoids the partial 'head' the previous
-- version relied on.
countOccurrences :: (Eq a, Ord a) => [a] -> [(a, Int)]
countOccurrences rolls =
    [ (x, length run) | run@(x : _) <- group (sort rolls) ]

View File

@ -1,21 +1,31 @@
{-# LANGUAGE RecordWildCards #-}
module Sync where
import Common.Server.ConsumerSettings
import Lib (getBoards, toClientSettings)
import SitesType (Site)
import BoardsType (Board)
import Common.Server.ConsumerSettings as Settings
import Common.Server.JSONSettings as JSONSettings
import Network.DataClient (getLatestPostsPerBoard)
-- | Look up a site together with its boards.
--
-- Derives client settings from the consumer settings and the per-site settings
-- via 'toClientSettings', then hands them to 'getBoards' along with the board
-- list configured for that site.
getSiteBoards :: ConsumerJSONSettings -> JSONSiteSettings -> IO (Site, [ Board ])
getSiteBoards settings site_settings = getBoards client_settings board_names
  where
    client_settings = toClientSettings settings site_settings
    board_names = boards site_settings
-- | Build a /partial/ 'JSONSettings.JSONSettings' from the consumer settings.
--
-- Only @postgrest_url@, @jwt@ and @media_root_path@ are copied over. The
-- remaining fields (@backup_read_root@, @site_name@, @site_url@) are not
-- populated here: they are left as bottoms that raise a descriptive error if
-- anything forces them, instead of the anonymous @Prelude.undefined@ failure.
consumerSettingsToPartialJSONSettings :: Settings.ConsumerJSONSettings -> JSONSettings.JSONSettings
consumerSettingsToPartialJSONSettings ConsumerJSONSettings {..} =
    JSONSettings
        { JSONSettings.postgrest_url = postgrest_url
        , JSONSettings.jwt = jwt
        , backup_read_root = unset "backup_read_root"
        , JSONSettings.media_root_path = media_root_path
        , site_name = unset "site_name"
        , site_url = unset "site_url"
        }
  where
    -- Placeholder for a field this conversion cannot fill in.
    unset :: String -> a
    unset field =
        error $ "consumerSettingsToPartialJSONSettings: field '" ++ field ++ "' is not set in partial JSONSettings"
syncWebsites :: ConsumerJSONSettings -> IO ()
syncWebsites _ = do
syncWebsites consumer_settings = do
putStrLn "Starting channel web synchronization."
let json_settings = consumerSettingsToPartialJSONSettings consumer_settings
asdf <- getLatestPostsPerBoard json_settings
print asdf
-- first we need all the (Site, Board) tuples
-- perhaps we even want all (Site, Board, Thread) pairs
-- But then we don't load the posts of each thread, instead only do