Compute the local_idx of each Post

This commit is contained in:
towards-a-new-leftypol 2024-03-19 23:09:40 -04:00
parent 2b1940b401
commit 335be14f7a
5 changed files with 162 additions and 33 deletions

View File

@ -17,6 +17,7 @@ DROP TABLE IF EXISTS threads CASCADE;
DROP TABLE IF EXISTS posts CASCADE; DROP TABLE IF EXISTS posts CASCADE;
DROP TABLE IF EXISTS attachments CASCADE; DROP TABLE IF EXISTS attachments CASCADE;
DROP TYPE IF EXISTS catalog_grid_result CASCADE; DROP TYPE IF EXISTS catalog_grid_result CASCADE;
DROP TYPE IF EXISTS post_key CASCADE;
DROP FUNCTION IF EXISTS update_post_body_search_index; DROP FUNCTION IF EXISTS update_post_body_search_index;
DROP FUNCTION IF EXISTS fetch_top_threads; DROP FUNCTION IF EXISTS fetch_top_threads;
DROP FUNCTION IF EXISTS fetch_catalog; DROP FUNCTION IF EXISTS fetch_catalog;
@ -170,6 +171,8 @@ $$ LANGUAGE sql;
*/ */
-- Deprecated: this doesn't insert local_idx, seems better to just get all the posts in application
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[]) CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
RETURNS TABLE (post_id bigint, board_post_id bigint, thread_id bigint) AS $$ RETURNS TABLE (post_id bigint, board_post_id bigint, thread_id bigint) AS $$
WITH WITH
@ -199,6 +202,21 @@ $$ LANGUAGE sql;
-- 1:21 for full db (nothing inserted) -- 1:21 for full db (nothing inserted)
CREATE TYPE post_key AS
( thread_id bigint
, board_post_id bigint
);
CREATE OR REPLACE FUNCTION get_posts(board_posts post_key[])
RETURNS SETOF posts AS $$
SELECT *
FROM posts
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(board_posts))
$$ LANGUAGE sql;
CREATE OR REPLACE FUNCTION fetch_top_threads( CREATE OR REPLACE FUNCTION fetch_top_threads(
p_start_time TIMESTAMPTZ, p_start_time TIMESTAMPTZ,
lookback INT DEFAULT 10000 lookback INT DEFAULT 10000
@ -345,6 +363,7 @@ REVOKE EXECUTE ON FUNCTION fetch_top_threads FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION fetch_catalog FROM PUBLIC; REVOKE EXECUTE ON FUNCTION fetch_catalog FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION search_posts FROM PUBLIC; REVOKE EXECUTE ON FUNCTION search_posts FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION update_post_body_search_index FROM PUBLIC; REVOKE EXECUTE ON FUNCTION update_post_body_search_index FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION get_posts FROM PUBLIC;
CREATE ROLE chan_archive_anon nologin; CREATE ROLE chan_archive_anon nologin;
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon; GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
@ -356,6 +375,7 @@ GRANT SELECT ON attachments TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archive_anon; GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archive_anon; GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION search_posts TO chan_archive_anon; GRANT EXECUTE ON FUNCTION search_posts TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon;
-- GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archive_anon; -- GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archive_anon;
-- GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archive_anon; -- GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archive_anon;
@ -374,6 +394,7 @@ GRANT EXECUTE ON FUNCTION insert_posts_and_return_ids TO chan_archiver;
GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archiver; GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archiver;
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archiver; GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archiver;
GRANT EXECUTE ON FUNCTION search_posts TO chan_archiver; GRANT EXECUTE ON FUNCTION search_posts TO chan_archiver;
GRANT EXECUTE ON FUNCTION get_posts TO chan_archiver;
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;

25
sql/make_get_posts.sql Normal file
View File

@ -0,0 +1,25 @@
BEGIN TRANSACTION;
DROP TYPE IF EXISTS post_key CASCADE;
-- DROP FUNCTION IF EXISTS get_posts;
CREATE TYPE post_key AS
( thread_id bigint
, board_post_id bigint
);
CREATE OR REPLACE FUNCTION get_posts(board_posts post_key[])
RETURNS SETOF posts AS $$
SELECT *
FROM posts
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(board_posts))
$$ LANGUAGE sql;
GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION get_posts TO chan_archiver;
REVOKE EXECUTE ON FUNCTION get_posts FROM PUBLIC;
COMMIT;

View File

@ -18,7 +18,8 @@ import System.Directory
, createDirectoryIfMissing , createDirectoryIfMissing
) )
import System.FilePath ((</>), (<.>), takeExtension) import System.FilePath ((</>), (<.>), takeExtension)
import Data.List (find, isSuffixOf, foldl') import Data.List (find, isSuffixOf, foldl', sortBy)
import Data.Ord (comparing)
import qualified Data.Set as Set import qualified Data.Set as Set
import Data.Set (Set) import Data.Set (Set)
import Data.Time.Clock.POSIX (posixSecondsToUTCTime) import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
@ -200,9 +201,12 @@ readPosts settings board thread = do
backupDir = backup_read_root settings </> Boards.pathpart board backupDir = backup_read_root settings </> Boards.pathpart board
apiPostToPostKey :: Threads.Thread -> JSONPosts.Post -> Client.PostId
apiPostToPostKey thread post = Client.PostId (Threads.thread_id thread) (JSONPosts.no post)
-- Convert Post to DbPost -- Convert Post to DbPost
apiPostToArchivePost :: Threads.Thread -> JSONPosts.Post -> Posts.Post apiPostToArchivePost :: Int -> Threads.Thread -> JSONPosts.Post -> Posts.Post
apiPostToArchivePost thread post = apiPostToArchivePost local_idx thread post =
Posts.Post Posts.Post
{ Posts.post_id = Nothing { Posts.post_id = Nothing
, Posts.board_post_id = JSONPosts.no post , Posts.board_post_id = JSONPosts.no post
@ -213,6 +217,7 @@ apiPostToArchivePost thread post =
, Posts.email = JSONPosts.email post , Posts.email = JSONPosts.email post
, Posts.thread_id = Threads.thread_id thread , Posts.thread_id = Threads.thread_id thread
, Posts.embed = JSONPosts.embed post , Posts.embed = JSONPosts.embed post
, Posts.local_idx = local_idx
} }
-- | A version of 'concatMap' that works with a monadic predicate. -- | A version of 'concatMap' that works with a monadic predicate.
@ -230,18 +235,19 @@ concatMapM op = foldr f (pure [])
pure $ x_ ++ xs_ pure $ x_ ++ xs_
setPostIdInPosts addPostsToTuples
:: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)] :: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)]
-> [ Client.PostId ] -> [ Posts.Post ]
-> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)] -> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)]
setPostIdInPosts tuples ids = map f ids addPostsToTuples tuples posts = map f posts
where where
post_map :: Map.Map (Int64, Int64) (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post) post_map :: Map.Map (Int64, Int64) (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)
post_map = Map.fromList (map (\(a, b, c, i, j) -> ((Posts.thread_id j, Posts.board_post_id j), (a, b, c, i, j))) tuples) post_map = Map.fromList (map (\(a, b, c, d) -> ((Threads.thread_id c, JSONPosts.no d), (a, b, c, d))) tuples)
f :: Client.PostId -> (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post) f :: Posts.Post -> (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)
f (Client.PostId { Client.post_id = asdf1, Client.thread_id = asdf2, Client.board_post_id = asdf3 }) = f new_post =
(\(a, b, c, i, j) -> (a, b, c, i, j { Posts.post_id = Just asdf1 })) (post_map Map.! (asdf2, asdf3)) (\(a, b, c, d) -> (a, b, c, d, new_post))
(post_map Map.! (Posts.thread_id new_post, Posts.board_post_id new_post))
fileToAttachment :: Int -> Posts.Post -> JS.File -> At.Attachment fileToAttachment :: Int -> Posts.Post -> JS.File -> At.Attachment
@ -520,6 +526,57 @@ processFiles settings tuples = do -- perfect just means that our posts have ids,
in Map.insert pid (x : l) accMap in Map.insert pid (x : l) accMap
createNewPosts
:: JSONSettings
-> [ (Threads.Thread, JSONPosts.Post, Client.PostId) ]
-> IO [ Posts.Post ]
createNewPosts settings tuples = do
existing_post_results <- Client.getPosts settings (map (\(_, _, c) -> c) tuples)
existing_posts <- either handleError return existing_post_results
let existing_set :: Set (Int64, Int64) = Set.fromList (map (\x -> (Posts.thread_id x, Posts.board_post_id x)) existing_posts)
let to_insert_list :: [ (Threads.Thread, JSONPosts.Post, Client.PostId) ] =
sortBy (comparing $ \(_, _, p) -> Client.board_post_id p) $
determineNew tuples existing_set
-- Map of thread_id to the largest local_idx value (which would be the number of the last post in the thread)
let local_idx :: Map.Map Int64 Int = foldr (Map.unionWith max) Map.empty $
map (\x -> Map.singleton (Posts.thread_id x) (Posts.local_idx x)) existing_posts
let insert_posts = fst $ foldl' foldFn ([], local_idx) to_insert_list
-- posts to insert are the posts that are not in existing_posts
-- so we create a Set (thread_id, board_post_id) ✓
-- then check every tuples against the set and the ones not in the set get added to a to_insert_list ✓
-- also for every tuples we need to compute a local_idx
-- so we create a Map index_map from thread_id to local_idx ✓
-- - for existing_posts
-- - need to compare posts already in the map with another post and keep the max local_idx ✓
-- to get the new local_idx, we must order the to_insert_list by board_post_id, and look up each entry ✓
posts_result <- Client.postPosts settings insert_posts
new_posts <- either handleError return posts_result
return $ existing_posts ++ new_posts
where
handleError err = print err >> exitFailure
determineNew :: [(Threads.Thread, JSONPosts.Post, Client.PostId)] -> Set (Int64, Int64) -> [(Threads.Thread, JSONPosts.Post, Client.PostId)]
determineNew ts existing_set = filter (\(_, _, c) -> Set.notMember (Client.thread_id c, Client.board_post_id c) existing_set) ts
foldFn :: ([Posts.Post], Map.Map Int64 Int) -> (Threads.Thread, JSONPosts.Post, Client.PostId) -> ([Posts.Post], Map.Map Int64 Int)
foldFn (posts, idx_map) (t, p, c) =
case Map.lookup thread_id idx_map of
Nothing -> ((post 1) : posts, Map.insert thread_id 1 idx_map)
Just i -> ((post (i + 1)) : posts, Map.insert thread_id (i + 1) idx_map)
where
post :: Int -> Posts.Post
post i = apiPostToArchivePost i t p
thread_id = Client.thread_id c
processBoard :: JSONSettings -> Sites.Site -> Boards.Board -> IO () processBoard :: JSONSettings -> Sites.Site -> Boards.Board -> IO ()
processBoard settings site board = do processBoard settings site board = do
let catalogPath = backupDir </> "catalog.json" let catalogPath = backupDir </> "catalog.json"
@ -535,21 +592,19 @@ processBoard settings site board = do
all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board
let tuples :: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)] = concatMap
(\(t, posts) -> map (\p -> (site, board, t, p, apiPostToArchivePost t p)) posts) let tuples :: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)] = concatMap
(\(t, posts) -> map (\p -> (site, board, t, p)) posts)
all_posts_on_board all_posts_on_board
posts_result <- Client.postPosts settings (map (\(_, _, _, _, d) -> d) tuples) posts_result :: [ Posts.Post ] <- createNewPosts settings (map (\(_, _, c, d) -> (c, d, apiPostToPostKey c d)) tuples)
case posts_result of
Left err -> print err
Right (new_ids :: [ Client.PostId ]) -> do
putStrLn "Sum of post_ids:" putStrLn "Sum of post_ids:"
print $ sum $ map Client.post_id new_ids print $ sum $ map (fromJust . Posts.post_id) posts_result
putStrLn "Sum of board_post_ids:" putStrLn "Sum of board_post_ids:"
print $ sum $ map Client.board_post_id new_ids print $ sum $ map Posts.board_post_id posts_result
let perfect_post_pairs = setPostIdInPosts tuples new_ids let perfect_post_pairs = addPostsToTuples tuples posts_result
processFiles settings perfect_post_pairs processFiles settings perfect_post_pairs

@ -1 +1 @@
Subproject commit 1b08b3ac079c6032e1090e79a8074d4cfd9ef24f Subproject commit aed106f40e24125d9057d3a9dda934a41c6d68a0

View File

@ -12,6 +12,7 @@ module DataClient
, postBoards , postBoards
, getThreads , getThreads
, postThreads , postThreads
, getPosts
, postPosts , postPosts
, getAttachments , getAttachments
, postAttachments , postAttachments
@ -26,6 +27,7 @@ import qualified Data.ByteString.Lazy.Char8 as LC8
import Data.List (intercalate) import Data.List (intercalate)
import Data.Aeson import Data.Aeson
( eitherDecode ( eitherDecode
, ToJSON
, FromJSON , FromJSON
, (.=) , (.=)
, object , object
@ -42,11 +44,12 @@ import qualified Common.AttachmentType as Attachments
import qualified Common.PostsType as Posts import qualified Common.PostsType as Posts
import Common.Network.HttpClient import Common.Network.HttpClient
data PostId = PostId data PostId = PostId
{ post_id :: Int64 { board_post_id :: Int64
, board_post_id :: Int64
, thread_id :: Int64 , thread_id :: Int64
} deriving (Show, Generic, FromJSON) } deriving (Show, Generic, ToJSON)
getSiteBoards :: T.JSONSettings -> Int -> IO (Either HttpError [ Boards.Board ]) getSiteBoards :: T.JSONSettings -> Int -> IO (Either HttpError [ Boards.Board ])
getSiteBoards settings site_id_ = get settings path >>= return . eitherDecodeResponse getSiteBoards settings site_id_ = get settings path >>= return . eitherDecodeResponse
@ -103,6 +106,7 @@ postThreads settings threads =
getAllSites :: T.JSONSettings -> IO (Either HttpError [ Sites.Site ]) getAllSites :: T.JSONSettings -> IO (Either HttpError [ Sites.Site ])
getAllSites settings = get settings "/sites" >>= return . eitherDecodeResponse getAllSites settings = get settings "/sites" >>= return . eitherDecodeResponse
getThreads :: T.JSONSettings -> Int -> [ Int ] -> IO (Either HttpError [ Threads.Thread ]) getThreads :: T.JSONSettings -> Int -> [ Int ] -> IO (Either HttpError [ Threads.Thread ])
getThreads settings board_id board_thread_ids = getThreads settings board_id board_thread_ids =
get settings path >>= return . eitherDecodeResponse get settings path >>= return . eitherDecodeResponse
@ -155,15 +159,39 @@ postAttachments settings attachments = do
payload = encode attachments payload = encode attachments
-- | Function to handle each chunk.
getPostsChunk :: T.JSONSettings -> [ PostId ] -> IO (Either HttpError [Posts.Post])
getPostsChunk settings chunk =
post settings "/rpc/get_posts" payload False >>= return . eitherDecodeResponse
where
payload = encode $ object [ "board_posts" .= chunk ]
getPosts :: T.JSONSettings -> [ PostId ] -> IO (Either HttpError [Posts.Post])
getPosts settings xs = do
results <- forM (chunkList chunkSize xs) (getPostsChunk settings)
return $ combineResults results
where
chunkSize = 1000
postPosts postPosts
:: T.JSONSettings :: T.JSONSettings
-> [ Posts.Post ] -> [ Posts.Post ]
-> IO (Either HttpError [ PostId ]) -> IO (Either HttpError [ Posts.Post ])
postPosts settings posts = postPosts settings posts = do
post settings "/rpc/insert_posts_and_return_ids" payload True >>= return . eitherDecodeResponse post settings "/posts" payload True >>= return . eitherDecodeResponse
where where
payload = encode $ object [ "new_posts" .= posts ] payload = encode posts
-- Old type:
-- postPosts
-- :: T.JSONSettings
-- -> [ Posts.Post ]
-- -> IO (Either HttpError [ PostId ])
eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a