diff --git a/chan-delorean.cabal b/chan-delorean.cabal index 1b0a10f..0eb0ef9 100644 --- a/chan-delorean.cabal +++ b/chan-delorean.cabal @@ -67,7 +67,6 @@ executable chan-delorean -- Modules included in this executable, other than Main. other-modules: JSONParsing - DataClient JSONSettings SitesType BoardsType @@ -79,6 +78,8 @@ executable chan-delorean Common.Network.HttpClient Hash Data.WordUtil + Network.DataClient + Network.DataClientTypes -- LANGUAGE extensions used by modules in this package. -- other-extensions: diff --git a/sql/initialize.sql b/sql/initialize.sql index 7d1c40f..1dd865f 100644 --- a/sql/initialize.sql +++ b/sql/initialize.sql @@ -381,7 +381,8 @@ GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon; -- GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archive_anon; GRANT chan_archive_anon TO admin; -CREATE ROLE chan_archiver noinherit login password 'test_password'; +CREATE ROLE chan_archiver noinherit login password 'test_password' + SET pgrst.db_aggregates_enabled = 'true'; GRANT CONNECT ON DATABASE chan_archives TO chan_archiver; GRANT chan_archive_anon TO chan_archiver; GRANT ALL ON sites TO chan_archiver; diff --git a/src/Backfill.hs b/src/Backfill.hs index bb57ebb..5b8d267 100644 --- a/src/Backfill.hs +++ b/src/Backfill.hs @@ -36,7 +36,7 @@ import JSONParsing import JSONSettings import qualified JSONCommonTypes as JS import qualified JSONPost as JSONPosts -import qualified DataClient as Client +import qualified Network.DataClient as Client import qualified SitesType as Sites import qualified BoardsType as Boards import qualified ThreadType as Threads @@ -45,6 +45,8 @@ import qualified Common.PostsType as Posts import qualified Hash as Hash import qualified Data.WordUtil as Words +import Debug.Trace (trace, traceShowId) + newtype SettingsCLI = SettingsCLI { jsonFile :: FilePath } deriving (Show, Data, Typeable) @@ -202,7 +204,11 @@ readPosts settings board thread = do apiPostToPostKey :: Threads.Thread -> JSONPosts.Post -> Client.PostId -apiPostToPostKey thread post = Client.PostId (Threads.thread_id thread) (JSONPosts.no post) +apiPostToPostKey thread post = + Client.PostId + { Client.thread_id = (Threads.thread_id thread) + , Client.board_post_id = (JSONPosts.no post) + } -- Convert Post to DbPost apiPostToArchivePost :: Int -> Threads.Thread -> JSONPosts.Post -> Posts.Post @@ -531,18 +537,20 @@ createNewPosts -> [ (Threads.Thread, JSONPosts.Post, Client.PostId) ] -> IO [ Posts.Post ] createNewPosts settings tuples = do - existing_post_results <- Client.getPosts settings (map (\(_, _, c) -> c) tuples) + existing_post_results <- Client.getPosts settings $ map (\(_, _, c) -> c) tuples existing_posts <- either handleError return existing_post_results + thread_max_local_idx_result <- Client.getThreadMaxLocalIdx settings thread_ids + thread_max_local_idxs <- either handleError return thread_max_local_idx_result + let existing_set :: Set (Int64, Int64) = Set.fromList (map (\x -> (Posts.thread_id x, Posts.board_post_id x)) existing_posts) let to_insert_list :: [ (Threads.Thread, JSONPosts.Post, Client.PostId) ] = sortBy (comparing $ \(_, _, p) -> Client.board_post_id p) $ - determineNew tuples existing_set + newPosts tuples existing_set -- Map of thread_id to the largest local_idx value (which would be the number of the last post in the thread) - let local_idx :: Map.Map Int64 Int = foldr (Map.unionWith max) Map.empty $ - map (\x -> Map.singleton (Posts.thread_id x) (Posts.local_idx x)) existing_posts + let local_idx :: Map.Map Int64 Int = Map.fromList thread_max_local_idxs let insert_posts = fst $ foldl' foldFn ([], local_idx) to_insert_list @@ -555,6 +563,7 @@ createNewPosts settings tuples = do -- - need to compare posts already in the map with another post and keep the max local_idx ✓ -- to get the new local_idx, we must order the to_insert_list by board_post_id, and look up each entry ✓ + print insert_posts posts_result <- Client.postPosts settings insert_posts new_posts <- either handleError return posts_result return $ existing_posts ++ new_posts @@ -562,14 +571,17 @@ createNewPosts settings tuples = do where handleError err = print err >> exitFailure - determineNew :: [(Threads.Thread, JSONPosts.Post, Client.PostId)] -> Set (Int64, Int64) -> [(Threads.Thread, JSONPosts.Post, Client.PostId)] - determineNew ts existing_set = filter (\(_, _, c) -> Set.notMember (Client.thread_id c, Client.board_post_id c) existing_set) ts + thread_ids :: [ Int64 ] + thread_ids = Set.elems $ Set.fromList $ map (\(t, _, _) -> Threads.thread_id t) tuples + + newPosts :: [(Threads.Thread, JSONPosts.Post, Client.PostId)] -> Set (Int64, Int64) -> [(Threads.Thread, JSONPosts.Post, Client.PostId)] + newPosts ts existing_set = filter (\(_, _, c) -> Set.notMember (Client.thread_id c, Client.board_post_id c) existing_set) ts foldFn :: ([Posts.Post], Map.Map Int64 Int) -> (Threads.Thread, JSONPosts.Post, Client.PostId) -> ([Posts.Post], Map.Map Int64 Int) - foldFn (posts, idx_map) (t, p, c) = - case Map.lookup thread_id idx_map of - Nothing -> ((post 1) : posts, Map.insert thread_id 1 idx_map) - Just i -> ((post (i + 1)) : posts, Map.insert thread_id (i + 1) idx_map) + foldFn (posts, idx_map) (t, p, c) = trace "foldFn" $ + case Map.lookup thread_id (trace ("map size: " ++ (show $ length idx_map)) idx_map) of + Nothing -> (traceShowId (post 1) : posts, trace ("inserting_1 " ++ show thread_id ++ ": 1")$ Map.insert thread_id 1 idx_map) + Just i -> (traceShowId (post (i + 1)) : posts, trace ("inserting_2 " ++ show thread_id ++ ": " ++ show (i + 1)) $ Map.insert thread_id (i + 1) idx_map) where post :: Int -> Posts.Post @@ -666,4 +678,6 @@ main = do Nothing -> do putStrLn "Error: Invalid JSON format." exitFailure - Just settings -> processBackupDirectory settings + Just settings -> do + processBackupDirectory settings + putStrLn "Done!" diff --git a/src/Common b/src/Common index aed106f..ffe952e 160000 --- a/src/Common +++ b/src/Common @@ -1 +1 @@ -Subproject commit aed106f40e24125d9057d3a9dda934a41c6d68a0 +Subproject commit ffe952e22939b21f8576d79752c863a95c208e1e diff --git a/src/DataClient.hs b/src/Network/DataClient.hs similarity index 88% rename from src/DataClient.hs rename to src/Network/DataClient.hs index be013fe..4a1d32b 100644 --- a/src/DataClient.hs +++ b/src/Network/DataClient.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveAnyClass #-} -module DataClient +module Network.DataClient ( HttpError(..) , PostId (..) , get @@ -11,6 +11,7 @@ module DataClient , post , postBoards , getThreads + , getThreadMaxLocalIdx , postThreads , getPosts , postPosts @@ -43,14 +44,14 @@ import qualified ThreadType as Threads import qualified Common.AttachmentType as Attachments import qualified Common.PostsType as Posts import Common.Network.HttpClient +import qualified Network.DataClientTypes as T data PostId = PostId - { board_post_id :: Int64 - , thread_id :: Int64 + { thread_id :: Int64 + , board_post_id :: Int64 } deriving (Show, Generic, ToJSON) - getSiteBoards :: T.JSONSettings -> Int -> IO (Either HttpError [ Boards.Board ]) getSiteBoards settings site_id_ = get settings path >>= return . eitherDecodeResponse where @@ -116,6 +117,19 @@ getThreads settings board_id board_thread_ids = ids :: String = intercalate "," $ map show board_thread_ids +getThreadMaxLocalIdx :: T.JSONSettings -> [ Int64 ] -> IO (Either HttpError [(Int64, Int)]) +getThreadMaxLocalIdx settings thread_ids = do + result :: Either HttpError [ T.ThreadMaxIdx ] <- get settings path >>= return . eitherDecodeResponse + + let results = result >>= \x -> return $ map (\t -> (T.thread_id t, T.max_idx t)) x + + return results + + where + path = "/posts?select=thread_id,max_idx:local_idx.max()&thread_id=in.(" ++ ids ++ ")" + ids :: String = intercalate "," $ map show thread_ids + + -- | Splits a list into chunks of a given size. chunkList :: Int -> [a] -> [[a]] chunkList _ [] = [] @@ -172,6 +186,7 @@ getPosts :: T.JSONSettings -> [ PostId ] -> IO (Either HttpError [Posts.Post]) getPosts settings xs = do results <- forM (chunkList chunkSize xs) (getPostsChunk settings) return $ combineResults results + where chunkSize = 1000 @@ -180,7 +195,7 @@ postPosts :: T.JSONSettings -> [ Posts.Post ] -> IO (Either HttpError [ Posts.Post ]) -postPosts settings posts = do +postPosts settings posts = post settings "/posts" payload True >>= return . eitherDecodeResponse where diff --git a/src/Network/DataClientTypes.hs b/src/Network/DataClientTypes.hs new file mode 100644 index 0000000..c74d0e2 --- /dev/null +++ b/src/Network/DataClientTypes.hs @@ -0,0 +1,13 @@ +{-# LANGUAGE DeriveAnyClass #-} + +module Network.DataClientTypes where + +import Data.Int (Int64) +import Data.Aeson (FromJSON) +import GHC.Generics + +data ThreadMaxIdx = ThreadMaxIdx + { thread_id :: Int64 + , max_idx :: Int + } deriving (Show, Generic, FromJSON) +