diff --git a/chan-delorean.cabal b/chan-delorean.cabal index cf94ad9..b1cae0b 100644 --- a/chan-delorean.cabal +++ b/chan-delorean.cabal @@ -68,12 +68,13 @@ executable chan-delorean other-modules: JSONParsing DataClient - Types + JSONSettings SitesType BoardsType ThreadType JSONPost JSONCommonTypes + PostsType -- LANGUAGE extensions used by modules in this package. -- other-extensions: diff --git a/shell.nix b/shell.nix index c20881c..998b42c 100644 --- a/shell.nix +++ b/shell.nix @@ -9,7 +9,7 @@ let }: mkDerivation { pname = "chan-delorean"; - version = "0.0.0.0"; + version = "0.0.2"; src = ./.; isLibrary = false; isExecutable = true; diff --git a/sql/initialize.sql b/sql/initialize.sql index 787b8c8..b07a684 100644 --- a/sql/initialize.sql +++ b/sql/initialize.sql @@ -110,6 +110,69 @@ CREATE INDEX attachments_post_id_idx ON attachments (post_id); CREATE INDEX attachments_md5_hash_idx ON attachments (md5_hash); CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops); + +/* + * Function Definitions + */ + +/* +CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[]) +RETURNS TABLE (post_id bigint, board_post_id bigint) AS $$ +WITH inserted AS ( + INSERT INTO posts (board_post_id, creation_time, body, thread_id) + SELECT np.board_post_id, np.creation_time, np.body, np.thread_id + FROM unnest(new_posts) AS np + ON CONFLICT (thread_id, board_post_id) DO NOTHING + RETURNING post_id, board_post_id +), +selected AS ( + SELECT post_id, board_post_id + FROM posts + WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts)) +) +SELECT * FROM inserted +UNION ALL +SELECT * FROM selected WHERE (post_id, board_post_id) NOT IN (SELECT post_id, board_post_id FROM inserted); +$$ LANGUAGE sql; + +-- 3m37s for clean db +-- 1m34s for full db (nothing inserted) + +*/ + +CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[]) +RETURNS TABLE (post_id bigint, board_post_id bigint, thread_id bigint) AS $$ +WITH +selected AS ( + SELECT post_id, board_post_id, thread_id + FROM posts + WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts)) +), +to_insert AS ( + SELECT np.* + FROM unnest(new_posts) AS np + LEFT OUTER JOIN selected s ON np.thread_id = s.thread_id AND np.board_post_id = s.board_post_id + WHERE s.post_id IS NULL +), +inserted AS ( + INSERT INTO posts (board_post_id, creation_time, body, thread_id) + SELECT board_post_id, creation_time, body, thread_id + FROM to_insert + RETURNING post_id, board_post_id, thread_id +) +SELECT * FROM inserted +UNION ALL +SELECT * FROM selected; +$$ LANGUAGE sql; + +-- 1:51 for clean db (this varies a lot) +-- 1:21 for full db (nothing inserted) + + +/* + * Permissions + */ + CREATE ROLE chan_archive_anon nologin; GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon; GRANT SELECT ON sites TO chan_archive_anon; @@ -130,9 +193,12 @@ GRANT ALL ON threads TO chan_archiver; GRANT ALL ON posts TO chan_archiver; GRANT ALL ON attachments TO chan_archiver; GRANT EXECUTE ON FUNCTION update_post_body_search_index TO chan_archiver; -GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver; -GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver; -GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver; +GRANT EXECUTE ON FUNCTION insert_posts_and_return_ids TO chan_archiver; +GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver; +GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver; +GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver; +GRANT usage, select ON SEQUENCE posts_post_id_seq TO chan_archiver; + GRANT chan_archiver TO admin; COMMIT; diff --git a/src/Backfill.hs b/src/Backfill.hs index 4d24a75..ae01168 100644 --- a/src/Backfill.hs +++ b/src/Backfill.hs @@ -12,14 +12,14 @@ import qualified Data.Set as Set import Data.Time.Clock.POSIX (posixSecondsToUTCTime) import Data.Time.Clock (UTCTime) - import JSONParsing -import Types +import JSONSettings +import qualified JSONPost as JSONPosts import qualified DataClient as Client -import qualified SitesType as Sites +import qualified SitesType as Sites import qualified BoardsType as Boards import qualified ThreadType as Threads -import qualified JSONPost as JSONPosts +import qualified PostsType as Posts data SettingsCLI = SettingsCLI { jsonFile :: FilePath @@ -28,7 +28,7 @@ data SettingsCLI = SettingsCLI settingsCLI :: SettingsCLI settingsCLI = SettingsCLI { jsonFile = def &= args &= typ "settings-jsonfile-path" - } &= summary "Backfill v0.0.1" + } &= summary "Backfill v0.0.2" listCatalogDirectories :: JSONSettings -> IO [FilePath] @@ -160,7 +160,12 @@ ensureThreads settings board all_threads = do new_threads <- createArchivesForNewThreads settings all_threads archived_threads board return $ archived_threads ++ new_threads -readPosts :: JSONSettings -> Boards.Board -> Threads.Thread -> IO [ JSONPosts.Post ] + +readPosts + :: JSONSettings + -> Boards.Board + -> Threads.Thread + -> IO (Threads.Thread, [ JSONPosts.Post ]) readPosts settings board thread = do result <- parsePosts thread_filename @@ -168,16 +173,50 @@ readPosts settings board thread = do Left err -> do putStrLn $ "Failed to parse the JSON file " ++ thread_filename ++ " error: " ++ err exitFailure - Right posts_wrapper -> return $ JSONPosts.posts posts_wrapper + Right posts_wrapper -> return $ (thread, JSONPosts.posts posts_wrapper) where - thread_filename :: FilePath + thread_filename :: FilePath thread_filename = backupDir "res" ((show $ Threads.board_thread_id thread) ++ ".json") backupDir :: FilePath backupDir = backup_read_root settings (Boards.pathpart board) +ensurePosts + :: JSONSettings + -> Boards.Board + -> [(Threads.Thread, [ Posts.Post ])] + -> IO [(Threads.Thread, [ Posts.Post ])] +ensurePosts = undefined + + +-- Convert Post to DbPost +apiPostToArchivePost :: Threads.Thread -> JSONPosts.Post -> Posts.Post +apiPostToArchivePost thread post = + Posts.Post + { Posts.post_id = Nothing + , Posts.board_post_id = JSONPosts.no post + , Posts.creation_time = posixSecondsToUTCTime (realToFrac $ JSONPosts.time post) + , Posts.body = JSONPosts.com post + , Posts.thread_id = Threads.thread_id thread + } + +-- | A version of 'concatMap' that works with a monadic predicate. +-- Stolen from package extra Control.Monad.Extra +concatMapM :: Monad m => (a -> m [b]) -> [a] -> m [b] +{-# INLINE concatMapM #-} +concatMapM op = foldr f (pure []) + where f x xs = do + x_ <- op x + + if null x_ + then xs + else do + xs_ <- xs + pure $ x_ ++ xs_ + + processBoard :: JSONSettings -> Boards.Board -> IO () processBoard settings board = do let catalogPath = backupDir "catalog.json" @@ -190,6 +229,26 @@ processBoard settings board = do let threads_on_board = concatMap threads catalogs all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board + + all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board + + -- putStrLn $ "Number of posts on /" ++ (Boards.pathpart board) ++ "/ " ++ (show $ length all_posts_on_board) + posts_result <- Client.postPosts settings (concatMap (\(t, posts) -> map (apiPostToArchivePost t) posts) all_posts_on_board) + + -- TODO: why doesn't it insert posts for threads that already exist? we can have new posts! + + case posts_result of + Left err -> print err + Right new_ids -> do + putStrLn "Sum of post_ids:" + print $ sum $ map Client.post_id new_ids + putStrLn "Sum of board_post_ids:" + print $ sum $ map Client.board_post_id new_ids + + -- max: 18,645 + -- min: 147 + -- total: 191,628 + -- -- f :: Threads.Thread -> [ Posts.Post ] -- for each thread we have to call a function that -- - reads the thread under the board directory: diff --git a/src/DataClient.hs b/src/DataClient.hs index 63f8daf..0a3a429 100644 --- a/src/DataClient.hs +++ b/src/DataClient.hs @@ -1,7 +1,9 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE DeriveAnyClass #-} module DataClient ( HttpError(..) + , PostId (..) , get , getSiteBoards , getAllSites @@ -10,8 +12,10 @@ module DataClient , postBoards , getThreads , postThreads + , postPosts ) where +import Data.Int (Int64) import Network.HTTP.Simple import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Lazy.Char8 as LC8 @@ -27,17 +31,24 @@ import Data.Aeson , encode , Value ) +import GHC.Generics -import qualified Types as T +import qualified JSONSettings as T import qualified SitesType as Sites import qualified BoardsType as Boards import qualified ThreadType as Threads +import qualified PostsType as Posts data HttpError = HttpException SomeException | StatusCodeError Int LBS.ByteString deriving (Show) +data PostId = PostId + { post_id :: Int64 + , board_post_id :: Int64 + , thread_id :: Int64 + } deriving (Show, Generic, FromJSON) get :: T.JSONSettings -> String -> IO (Either HttpError LBS.ByteString) get settings path = do @@ -151,6 +162,17 @@ getThreads settings board_id board_thread_ids = path = "/threads?board_thread_id=in.(" ++ ids ++ ")&board_id=eq." ++ show board_id ids :: String = intercalate "," $ map show board_thread_ids +postPosts + :: T.JSONSettings + -> [ Posts.Post ] + -> IO (Either HttpError [ PostId ]) +postPosts settings posts = + post settings "/rpc/insert_posts_and_return_ids" payload True >>= return . eitherDecodeResponse + + where + payload = encode $ object [ "new_posts" .= posts ] + + eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a eitherDecodeResponse (Left err) = Left err eitherDecodeResponse (Right bs) = diff --git a/src/JSONCommonTypes.hs b/src/JSONCommonTypes.hs index 383dab3..832d4b3 100644 --- a/src/JSONCommonTypes.hs +++ b/src/JSONCommonTypes.hs @@ -1,14 +1,29 @@ module JSONCommonTypes ( File (..) + , Cyclical (..) ) where import Data.Text (Text) -import Data.Aeson (FromJSON) +import qualified Data.Text as T +import Data.Aeson +import Data.Aeson.Types (typeMismatch) import GHC.Generics +data Cyclical = Cyclical Int deriving (Show, Generic) + +instance FromJSON Cyclical where + parseJSON (Number n) = return $ Cyclical (floor n) + parseJSON (String s) = + case reads (T.unpack s) :: [(Int, String)] of + [(n, "")] -> return $ Cyclical n + _ -> typeMismatch "Int or String containing Int" (String s) + + parseJSON invalid = typeMismatch "Int or String" invalid + + data File = File { id :: Text - , mime :: Text + , mime :: Maybe Text , ext :: Text , h :: Maybe Int , w :: Maybe Int diff --git a/src/JSONParsing.hs b/src/JSONParsing.hs index 92dbd60..9c24587 100644 --- a/src/JSONParsing.hs +++ b/src/JSONParsing.hs @@ -9,26 +9,13 @@ module JSONParsing {-# LANGUAGE OverloadedStrings #-} import Data.Text (Text) -import Data.Aeson import GHC.Generics import qualified Data.ByteString.Lazy as B -import qualified Data.Text as T -import Data.Aeson.Types (typeMismatch) +import Data.Aeson import qualified JSONPost as Post import qualified JSONCommonTypes as J -data Cyclical = Cyclical Int deriving (Show, Generic) - -instance FromJSON Cyclical where - parseJSON (Number n) = return $ Cyclical (floor n) - parseJSON (String s) = - case reads (T.unpack s) :: [(Int, String)] of - [(n, "")] -> return $ Cyclical n - _ -> typeMismatch "Int or String containing Int" (String s) - - parseJSON invalid = typeMismatch "Int or String" invalid - data Thread = Thread { no :: Int , sub :: Maybe Text @@ -42,7 +29,7 @@ data Thread = Thread , images :: Maybe Int , sticky :: Maybe Int , locked :: Maybe Int - , cyclical :: Maybe Cyclical + , cyclical :: Maybe J.Cyclical , last_modified :: Int , board :: Text , files :: Maybe [J.File] diff --git a/src/JSONPost.hs b/src/JSONPost.hs index 9a25929..5179398 100644 --- a/src/JSONPost.hs +++ b/src/JSONPost.hs @@ -4,12 +4,13 @@ module JSONPost ) where import Data.Text (Text) +import Data.Int (Int64) import Data.Aeson (FromJSON) import GHC.Generics import qualified JSONCommonTypes as J data Post = Post - { no :: Int + { no :: Int64 , com :: Maybe Text , name :: Maybe Text , time :: Int @@ -17,7 +18,7 @@ data Post = Post , omitted_images :: Maybe Int , sticky :: Maybe Int , locked :: Maybe Int - , cyclical :: Maybe Int + , cyclical :: Maybe J.Cyclical , last_modified :: Int , board :: String , files :: Maybe [J.File] diff --git a/src/Types.hs b/src/JSONSettings.hs similarity index 93% rename from src/Types.hs rename to src/JSONSettings.hs index cf14b58..6e794db 100644 --- a/src/Types.hs +++ b/src/JSONSettings.hs @@ -1,4 +1,4 @@ -module Types +module JSONSettings ( JSONSettings(..) ) where diff --git a/src/PostsType.hs b/src/PostsType.hs new file mode 100644 index 0000000..bb6ff66 --- /dev/null +++ b/src/PostsType.hs @@ -0,0 +1,19 @@ +{-# LANGUAGE DeriveAnyClass #-} + +module PostsType + ( Post (..) ) + where + +import GHC.Generics +import Data.Aeson (FromJSON, ToJSON) +import Data.Time.Clock (UTCTime) -- Required for timestamp with time zone +import Data.Int (Int64) +import Data.Text (Text) + +data Post = Post + { post_id :: Maybe Int64 + , board_post_id :: Int64 + , creation_time :: UTCTime + , body :: Maybe Text + , thread_id :: Int + } deriving (Show, Generic, FromJSON, ToJSON)