Fix ensuring correct local_idx for posts

- query each thread's max local_idx before processing
This commit is contained in:
towards-a-new-leftypol 2024-03-20 04:56:53 -04:00
parent 335be14f7a
commit 7c7c6df955
6 changed files with 65 additions and 21 deletions

View File

@ -67,7 +67,6 @@ executable chan-delorean
-- Modules included in this executable, other than Main. -- Modules included in this executable, other than Main.
other-modules: other-modules:
JSONParsing JSONParsing
DataClient
JSONSettings JSONSettings
SitesType SitesType
BoardsType BoardsType
@ -79,6 +78,8 @@ executable chan-delorean
Common.Network.HttpClient Common.Network.HttpClient
Hash Hash
Data.WordUtil Data.WordUtil
Network.DataClient
Network.DataClientTypes
-- LANGUAGE extensions used by modules in this package. -- LANGUAGE extensions used by modules in this package.
-- other-extensions: -- other-extensions:

View File

@ -381,7 +381,8 @@ GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon;
-- GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archive_anon; -- GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archive_anon;
GRANT chan_archive_anon TO admin; GRANT chan_archive_anon TO admin;
CREATE ROLE chan_archiver noinherit login password 'test_password'; CREATE ROLE chan_archiver noinherit login password 'test_password'
SET pgrst.db_aggregates_enabled = 'true';
GRANT CONNECT ON DATABASE chan_archives TO chan_archiver; GRANT CONNECT ON DATABASE chan_archives TO chan_archiver;
GRANT chan_archive_anon TO chan_archiver; GRANT chan_archive_anon TO chan_archiver;
GRANT ALL ON sites TO chan_archiver; GRANT ALL ON sites TO chan_archiver;

View File

@ -36,7 +36,7 @@ import JSONParsing
import JSONSettings import JSONSettings
import qualified JSONCommonTypes as JS import qualified JSONCommonTypes as JS
import qualified JSONPost as JSONPosts import qualified JSONPost as JSONPosts
import qualified DataClient as Client import qualified Network.DataClient as Client
import qualified SitesType as Sites import qualified SitesType as Sites
import qualified BoardsType as Boards import qualified BoardsType as Boards
import qualified ThreadType as Threads import qualified ThreadType as Threads
@ -45,6 +45,8 @@ import qualified Common.PostsType as Posts
import qualified Hash as Hash import qualified Hash as Hash
import qualified Data.WordUtil as Words import qualified Data.WordUtil as Words
import Debug.Trace (trace, traceShowId)
newtype SettingsCLI = SettingsCLI newtype SettingsCLI = SettingsCLI
{ jsonFile :: FilePath { jsonFile :: FilePath
} deriving (Show, Data, Typeable) } deriving (Show, Data, Typeable)
@ -202,7 +204,11 @@ readPosts settings board thread = do
apiPostToPostKey :: Threads.Thread -> JSONPosts.Post -> Client.PostId apiPostToPostKey :: Threads.Thread -> JSONPosts.Post -> Client.PostId
apiPostToPostKey thread post = Client.PostId (Threads.thread_id thread) (JSONPosts.no post) apiPostToPostKey thread post =
Client.PostId
{ Client.thread_id = (Threads.thread_id thread)
, Client.board_post_id = (JSONPosts.no post)
}
-- Convert Post to DbPost -- Convert Post to DbPost
apiPostToArchivePost :: Int -> Threads.Thread -> JSONPosts.Post -> Posts.Post apiPostToArchivePost :: Int -> Threads.Thread -> JSONPosts.Post -> Posts.Post
@ -531,18 +537,20 @@ createNewPosts
-> [ (Threads.Thread, JSONPosts.Post, Client.PostId) ] -> [ (Threads.Thread, JSONPosts.Post, Client.PostId) ]
-> IO [ Posts.Post ] -> IO [ Posts.Post ]
createNewPosts settings tuples = do createNewPosts settings tuples = do
existing_post_results <- Client.getPosts settings (map (\(_, _, c) -> c) tuples) existing_post_results <- Client.getPosts settings $ map (\(_, _, c) -> c) tuples
existing_posts <- either handleError return existing_post_results existing_posts <- either handleError return existing_post_results
thread_max_local_idx_result <- Client.getThreadMaxLocalIdx settings thread_ids
thread_max_local_idxs <- either handleError return thread_max_local_idx_result
let existing_set :: Set (Int64, Int64) = Set.fromList (map (\x -> (Posts.thread_id x, Posts.board_post_id x)) existing_posts) let existing_set :: Set (Int64, Int64) = Set.fromList (map (\x -> (Posts.thread_id x, Posts.board_post_id x)) existing_posts)
let to_insert_list :: [ (Threads.Thread, JSONPosts.Post, Client.PostId) ] = let to_insert_list :: [ (Threads.Thread, JSONPosts.Post, Client.PostId) ] =
sortBy (comparing $ \(_, _, p) -> Client.board_post_id p) $ sortBy (comparing $ \(_, _, p) -> Client.board_post_id p) $
determineNew tuples existing_set newPosts tuples existing_set
-- Map of thread_id to the largest local_idx value (which would be the number of the last post in the thread) -- Map of thread_id to the largest local_idx value (which would be the number of the last post in the thread)
let local_idx :: Map.Map Int64 Int = foldr (Map.unionWith max) Map.empty $ let local_idx :: Map.Map Int64 Int = Map.fromList thread_max_local_idxs
map (\x -> Map.singleton (Posts.thread_id x) (Posts.local_idx x)) existing_posts
let insert_posts = fst $ foldl' foldFn ([], local_idx) to_insert_list let insert_posts = fst $ foldl' foldFn ([], local_idx) to_insert_list
@ -555,6 +563,7 @@ createNewPosts settings tuples = do
-- - need to compare posts already in the map with another post and keep the max local_idx ✓ -- - need to compare posts already in the map with another post and keep the max local_idx ✓
-- to get the new local_idx, we must order the to_insert_list by board_post_id, and look up each entry ✓ -- to get the new local_idx, we must order the to_insert_list by board_post_id, and look up each entry ✓
print insert_posts
posts_result <- Client.postPosts settings insert_posts posts_result <- Client.postPosts settings insert_posts
new_posts <- either handleError return posts_result new_posts <- either handleError return posts_result
return $ existing_posts ++ new_posts return $ existing_posts ++ new_posts
@ -562,14 +571,17 @@ createNewPosts settings tuples = do
where where
handleError err = print err >> exitFailure handleError err = print err >> exitFailure
determineNew :: [(Threads.Thread, JSONPosts.Post, Client.PostId)] -> Set (Int64, Int64) -> [(Threads.Thread, JSONPosts.Post, Client.PostId)] thread_ids :: [ Int64 ]
determineNew ts existing_set = filter (\(_, _, c) -> Set.notMember (Client.thread_id c, Client.board_post_id c) existing_set) ts thread_ids = Set.elems $ Set.fromList $ map (\(t, _, _) -> Threads.thread_id t) tuples
newPosts :: [(Threads.Thread, JSONPosts.Post, Client.PostId)] -> Set (Int64, Int64) -> [(Threads.Thread, JSONPosts.Post, Client.PostId)]
newPosts ts existing_set = filter (\(_, _, c) -> Set.notMember (Client.thread_id c, Client.board_post_id c) existing_set) ts
foldFn :: ([Posts.Post], Map.Map Int64 Int) -> (Threads.Thread, JSONPosts.Post, Client.PostId) -> ([Posts.Post], Map.Map Int64 Int) foldFn :: ([Posts.Post], Map.Map Int64 Int) -> (Threads.Thread, JSONPosts.Post, Client.PostId) -> ([Posts.Post], Map.Map Int64 Int)
foldFn (posts, idx_map) (t, p, c) = foldFn (posts, idx_map) (t, p, c) = trace "foldFn" $
case Map.lookup thread_id idx_map of case Map.lookup thread_id (trace ("map size: " ++ (show $ length idx_map)) idx_map) of
Nothing -> ((post 1) : posts, Map.insert thread_id 1 idx_map) Nothing -> (traceShowId (post 1) : posts, trace ("inserting_1 " ++ show thread_id ++ ": 1")$ Map.insert thread_id 1 idx_map)
Just i -> ((post (i + 1)) : posts, Map.insert thread_id (i + 1) idx_map) Just i -> (traceShowId (post (i + 1)) : posts, trace ("inserting_2 " ++ show thread_id ++ ": " ++ show (i + 1)) $ Map.insert thread_id (i + 1) idx_map)
where where
post :: Int -> Posts.Post post :: Int -> Posts.Post
@ -666,4 +678,6 @@ main = do
Nothing -> do Nothing -> do
putStrLn "Error: Invalid JSON format." putStrLn "Error: Invalid JSON format."
exitFailure exitFailure
Just settings -> processBackupDirectory settings Just settings -> do
processBackupDirectory settings
putStrLn "Done!"

@ -1 +1 @@
Subproject commit aed106f40e24125d9057d3a9dda934a41c6d68a0 Subproject commit ffe952e22939b21f8576d79752c863a95c208e1e

View File

@ -1,7 +1,7 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE DeriveAnyClass #-}
module DataClient module Network.DataClient
( HttpError(..) ( HttpError(..)
, PostId (..) , PostId (..)
, get , get
@ -11,6 +11,7 @@ module DataClient
, post , post
, postBoards , postBoards
, getThreads , getThreads
, getThreadMaxLocalIdx
, postThreads , postThreads
, getPosts , getPosts
, postPosts , postPosts
@ -43,14 +44,14 @@ import qualified ThreadType as Threads
import qualified Common.AttachmentType as Attachments import qualified Common.AttachmentType as Attachments
import qualified Common.PostsType as Posts import qualified Common.PostsType as Posts
import Common.Network.HttpClient import Common.Network.HttpClient
import qualified Network.DataClientTypes as T
data PostId = PostId data PostId = PostId
{ board_post_id :: Int64 { thread_id :: Int64
, thread_id :: Int64 , board_post_id :: Int64
} deriving (Show, Generic, ToJSON) } deriving (Show, Generic, ToJSON)
getSiteBoards :: T.JSONSettings -> Int -> IO (Either HttpError [ Boards.Board ]) getSiteBoards :: T.JSONSettings -> Int -> IO (Either HttpError [ Boards.Board ])
getSiteBoards settings site_id_ = get settings path >>= return . eitherDecodeResponse getSiteBoards settings site_id_ = get settings path >>= return . eitherDecodeResponse
where where
@ -116,6 +117,19 @@ getThreads settings board_id board_thread_ids =
ids :: String = intercalate "," $ map show board_thread_ids ids :: String = intercalate "," $ map show board_thread_ids
getThreadMaxLocalIdx :: T.JSONSettings -> [ Int64 ] -> IO (Either HttpError [(Int64, Int)])
getThreadMaxLocalIdx settings thread_ids = do
result :: Either HttpError [ T.ThreadMaxIdx ] <- get settings path >>= return . eitherDecodeResponse
let results = result >>= \x -> return $ map (\t -> (T.thread_id t, T.max_idx t)) x
return results
where
path = "/posts?select=thread_id,max_idx:local_idx.max()&thread_id=in.(" ++ ids ++ ")"
ids :: String = intercalate "," $ map show thread_ids
-- | Splits a list into chunks of a given size. -- | Splits a list into chunks of a given size.
chunkList :: Int -> [a] -> [[a]] chunkList :: Int -> [a] -> [[a]]
chunkList _ [] = [] chunkList _ [] = []
@ -172,6 +186,7 @@ getPosts :: T.JSONSettings -> [ PostId ] -> IO (Either HttpError [Posts.Post])
getPosts settings xs = do getPosts settings xs = do
results <- forM (chunkList chunkSize xs) (getPostsChunk settings) results <- forM (chunkList chunkSize xs) (getPostsChunk settings)
return $ combineResults results return $ combineResults results
where where
chunkSize = 1000 chunkSize = 1000
@ -180,7 +195,7 @@ postPosts
:: T.JSONSettings :: T.JSONSettings
-> [ Posts.Post ] -> [ Posts.Post ]
-> IO (Either HttpError [ Posts.Post ]) -> IO (Either HttpError [ Posts.Post ])
postPosts settings posts = do postPosts settings posts =
post settings "/posts" payload True >>= return . eitherDecodeResponse post settings "/posts" payload True >>= return . eitherDecodeResponse
where where

View File

@ -0,0 +1,13 @@
{-# LANGUAGE DeriveAnyClass #-}
module Network.DataClientTypes where
import Data.Int (Int64)
import Data.Aeson (FromJSON)
import GHC.Generics
data ThreadMaxIdx = ThreadMaxIdx
{ thread_id :: Int64
, max_idx :: Int
} deriving (Show, Generic, FromJSON)