Fix ensuring correct local_idx for posts

- query each thread's max local_idx before processing
This commit is contained in:
towards-a-new-leftypol 2024-03-20 04:56:53 -04:00
parent 335be14f7a
commit 7c7c6df955
6 changed files with 65 additions and 21 deletions

View File

@ -67,7 +67,6 @@ executable chan-delorean
-- Modules included in this executable, other than Main.
other-modules:
JSONParsing
DataClient
JSONSettings
SitesType
BoardsType
@ -79,6 +78,8 @@ executable chan-delorean
Common.Network.HttpClient
Hash
Data.WordUtil
Network.DataClient
Network.DataClientTypes
-- LANGUAGE extensions used by modules in this package.
-- other-extensions:

View File

@ -381,7 +381,8 @@ GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon;
-- GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archive_anon;
GRANT chan_archive_anon TO admin;
CREATE ROLE chan_archiver noinherit login password 'test_password';
CREATE ROLE chan_archiver noinherit login password 'test_password'
SET pgrst.db_aggregates_enabled = 'true';
GRANT CONNECT ON DATABASE chan_archives TO chan_archiver;
GRANT chan_archive_anon TO chan_archiver;
GRANT ALL ON sites TO chan_archiver;

View File

@ -36,7 +36,7 @@ import JSONParsing
import JSONSettings
import qualified JSONCommonTypes as JS
import qualified JSONPost as JSONPosts
import qualified DataClient as Client
import qualified Network.DataClient as Client
import qualified SitesType as Sites
import qualified BoardsType as Boards
import qualified ThreadType as Threads
@ -45,6 +45,8 @@ import qualified Common.PostsType as Posts
import qualified Hash as Hash
import qualified Data.WordUtil as Words
import Debug.Trace (trace, traceShowId)
newtype SettingsCLI = SettingsCLI
{ jsonFile :: FilePath
} deriving (Show, Data, Typeable)
@ -202,7 +204,11 @@ readPosts settings board thread = do
apiPostToPostKey :: Threads.Thread -> JSONPosts.Post -> Client.PostId
apiPostToPostKey thread post = Client.PostId (Threads.thread_id thread) (JSONPosts.no post)
apiPostToPostKey thread post =
Client.PostId
{ Client.thread_id = (Threads.thread_id thread)
, Client.board_post_id = (JSONPosts.no post)
}
-- Convert Post to DbPost
apiPostToArchivePost :: Int -> Threads.Thread -> JSONPosts.Post -> Posts.Post
@ -531,18 +537,20 @@ createNewPosts
-> [ (Threads.Thread, JSONPosts.Post, Client.PostId) ]
-> IO [ Posts.Post ]
createNewPosts settings tuples = do
existing_post_results <- Client.getPosts settings (map (\(_, _, c) -> c) tuples)
existing_post_results <- Client.getPosts settings $ map (\(_, _, c) -> c) tuples
existing_posts <- either handleError return existing_post_results
thread_max_local_idx_result <- Client.getThreadMaxLocalIdx settings thread_ids
thread_max_local_idxs <- either handleError return thread_max_local_idx_result
let existing_set :: Set (Int64, Int64) = Set.fromList (map (\x -> (Posts.thread_id x, Posts.board_post_id x)) existing_posts)
let to_insert_list :: [ (Threads.Thread, JSONPosts.Post, Client.PostId) ] =
sortBy (comparing $ \(_, _, p) -> Client.board_post_id p) $
determineNew tuples existing_set
newPosts tuples existing_set
-- Map of thread_id to the largest local_idx value (which would be the number of the last post in the thread)
let local_idx :: Map.Map Int64 Int = foldr (Map.unionWith max) Map.empty $
map (\x -> Map.singleton (Posts.thread_id x) (Posts.local_idx x)) existing_posts
let local_idx :: Map.Map Int64 Int = Map.fromList thread_max_local_idxs
let insert_posts = fst $ foldl' foldFn ([], local_idx) to_insert_list
@ -555,6 +563,7 @@ createNewPosts settings tuples = do
-- - need to compare posts already in the map with another post and keep the max local_idx ✓
-- to get the new local_idx, we must order the to_insert_list by board_post_id, and look up each entry ✓
print insert_posts
posts_result <- Client.postPosts settings insert_posts
new_posts <- either handleError return posts_result
return $ existing_posts ++ new_posts
@ -562,14 +571,17 @@ createNewPosts settings tuples = do
where
handleError err = print err >> exitFailure
determineNew :: [(Threads.Thread, JSONPosts.Post, Client.PostId)] -> Set (Int64, Int64) -> [(Threads.Thread, JSONPosts.Post, Client.PostId)]
determineNew ts existing_set = filter (\(_, _, c) -> Set.notMember (Client.thread_id c, Client.board_post_id c) existing_set) ts
thread_ids :: [ Int64 ]
thread_ids = Set.elems $ Set.fromList $ map (\(t, _, _) -> Threads.thread_id t) tuples
newPosts :: [(Threads.Thread, JSONPosts.Post, Client.PostId)] -> Set (Int64, Int64) -> [(Threads.Thread, JSONPosts.Post, Client.PostId)]
newPosts ts existing_set = filter (\(_, _, c) -> Set.notMember (Client.thread_id c, Client.board_post_id c) existing_set) ts
foldFn :: ([Posts.Post], Map.Map Int64 Int) -> (Threads.Thread, JSONPosts.Post, Client.PostId) -> ([Posts.Post], Map.Map Int64 Int)
foldFn (posts, idx_map) (t, p, c) =
case Map.lookup thread_id idx_map of
Nothing -> ((post 1) : posts, Map.insert thread_id 1 idx_map)
Just i -> ((post (i + 1)) : posts, Map.insert thread_id (i + 1) idx_map)
foldFn (posts, idx_map) (t, p, c) = trace "foldFn" $
case Map.lookup thread_id (trace ("map size: " ++ (show $ length idx_map)) idx_map) of
Nothing -> (traceShowId (post 1) : posts, trace ("inserting_1 " ++ show thread_id ++ ": 1")$ Map.insert thread_id 1 idx_map)
Just i -> (traceShowId (post (i + 1)) : posts, trace ("inserting_2 " ++ show thread_id ++ ": " ++ show (i + 1)) $ Map.insert thread_id (i + 1) idx_map)
where
post :: Int -> Posts.Post
@ -666,4 +678,6 @@ main = do
Nothing -> do
putStrLn "Error: Invalid JSON format."
exitFailure
Just settings -> processBackupDirectory settings
Just settings -> do
processBackupDirectory settings
putStrLn "Done!"

@ -1 +1 @@
Subproject commit aed106f40e24125d9057d3a9dda934a41c6d68a0
Subproject commit ffe952e22939b21f8576d79752c863a95c208e1e

View File

@ -1,7 +1,7 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveAnyClass #-}
module DataClient
module Network.DataClient
( HttpError(..)
, PostId (..)
, get
@ -11,6 +11,7 @@ module DataClient
, post
, postBoards
, getThreads
, getThreadMaxLocalIdx
, postThreads
, getPosts
, postPosts
@ -43,14 +44,14 @@ import qualified ThreadType as Threads
import qualified Common.AttachmentType as Attachments
import qualified Common.PostsType as Posts
import Common.Network.HttpClient
import qualified Network.DataClientTypes as T
data PostId = PostId
{ board_post_id :: Int64
, thread_id :: Int64
{ thread_id :: Int64
, board_post_id :: Int64
} deriving (Show, Generic, ToJSON)
getSiteBoards :: T.JSONSettings -> Int -> IO (Either HttpError [ Boards.Board ])
getSiteBoards settings site_id_ = get settings path >>= return . eitherDecodeResponse
where
@ -116,6 +117,19 @@ getThreads settings board_id board_thread_ids =
ids :: String = intercalate "," $ map show board_thread_ids
getThreadMaxLocalIdx :: T.JSONSettings -> [ Int64 ] -> IO (Either HttpError [(Int64, Int)])
getThreadMaxLocalIdx settings thread_ids = do
result :: Either HttpError [ T.ThreadMaxIdx ] <- get settings path >>= return . eitherDecodeResponse
let results = result >>= \x -> return $ map (\t -> (T.thread_id t, T.max_idx t)) x
return results
where
path = "/posts?select=thread_id,max_idx:local_idx.max()&thread_id=in.(" ++ ids ++ ")"
ids :: String = intercalate "," $ map show thread_ids
-- | Splits a list into chunks of a given size.
chunkList :: Int -> [a] -> [[a]]
chunkList _ [] = []
@ -172,6 +186,7 @@ getPosts :: T.JSONSettings -> [ PostId ] -> IO (Either HttpError [Posts.Post])
getPosts settings xs = do
results <- forM (chunkList chunkSize xs) (getPostsChunk settings)
return $ combineResults results
where
chunkSize = 1000
@ -180,7 +195,7 @@ postPosts
:: T.JSONSettings
-> [ Posts.Post ]
-> IO (Either HttpError [ Posts.Post ])
postPosts settings posts = do
postPosts settings posts =
post settings "/posts" payload True >>= return . eitherDecodeResponse
where

View File

@ -0,0 +1,13 @@
{-# LANGUAGE DeriveAnyClass #-}
module Network.DataClientTypes where
import Data.Int (Int64)
import Data.Aeson (FromJSON)
import GHC.Generics
data ThreadMaxIdx = ThreadMaxIdx
{ thread_id :: Int64
, max_idx :: Int
} deriving (Show, Generic, FromJSON)