diff --git a/sql/initialize.sql b/sql/initialize.sql index 71f0ba2..dd58cc7 100644 --- a/sql/initialize.sql +++ b/sql/initialize.sql @@ -71,6 +71,7 @@ CREATE TABLE IF NOT EXISTS posts , email text , body_search_index tsvector , thread_id bigint NOT NULL +-- , TODO: embed , CONSTRAINT unique_thread_board_id_constraint UNIQUE (thread_id, board_post_id) , CONSTRAINT thread_fk FOREIGN KEY (thread_id) REFERENCES threads (thread_id) ON DELETE CASCADE ); @@ -181,6 +182,8 @@ $$ LANGUAGE sql; -- 1:21 for full db (nothing inserted) +/* + * Is this even needed? CREATE OR REPLACE FUNCTION insert_attachments_and_return_ids( attachments_payload attachments[]) RETURNS TABLE (attachment_id bigint, post_id bigint, sha256_hash text) AS $$ @@ -209,6 +212,7 @@ SELECT * FROM inserted UNION ALL SELECT * FROM selected; $$ LANGUAGE sql; +*/ CREATE OR REPLACE FUNCTION fetch_top_threads( diff --git a/src/AttachmentType.hs b/src/AttachmentType.hs index 6ef5c74..a20e1d2 100644 --- a/src/AttachmentType.hs +++ b/src/AttachmentType.hs @@ -5,15 +5,16 @@ module AttachmentType import GHC.Generics import Data.Int (Int64) -import Data.Aeson (FromJSON) +import Data.Aeson (FromJSON, ToJSON) import Data.Text (Text) +import Data.Time.Clock (UTCTime) data Attachment = Attachment { attachment_id :: Maybe Int64 , mimetype :: Text , creation_time :: UTCTime - , sha256_hash :: Int + , sha256_hash :: Text , phash :: Int64 - , phash :: Bool + , illegal :: Bool , post_id :: Int64 - } deriving (Show, Generic, FromJSON) + } deriving (Show, Generic, FromJSON, ToJSON) diff --git a/src/Backfill.hs b/src/Backfill.hs index 550dcb5..f166bf6 100644 --- a/src/Backfill.hs +++ b/src/Backfill.hs @@ -1,9 +1,12 @@ {-# OPTIONS_GHC -Wno-unrecognised-pragmas #-} {-# HLINT ignore "Redundant bracket" #-} {-# HLINT ignore "Use fromMaybe" #-} +{-# LANGUAGE OverloadedStrings #-} + module Main where import System.Exit +import Data.Int (Int64) import Control.Monad (filterM) import Data.Aeson (decode) import qualified Data.ByteString.Lazy as B @@ -15,15 +18,19 @@ import qualified Data.Set as Set import Data.Set (Set) import Data.Time.Clock.POSIX (posixSecondsToUTCTime) import Data.Time.Clock (UTCTime) +import qualified Data.Map as Map +import Data.Maybe (fromJust) import JSONParsing import JSONSettings +import qualified JSONCommonTypes as JS import qualified JSONPost as JSONPosts import qualified DataClient as Client import qualified SitesType as Sites import qualified BoardsType as Boards import qualified ThreadType as Threads -import qualified Common.PostsType as Posts +import qualified AttachmentType as Attachments +import qualified Common.PostsType as Posts newtype SettingsCLI = SettingsCLI { jsonFile :: FilePath @@ -181,14 +188,6 @@ readPosts settings board thread = do backupDir = backup_read_root settings Boards.pathpart board -ensurePosts - :: JSONSettings - -> Boards.Board - -> [(Threads.Thread, [ Posts.Post ])] - -> IO [(Threads.Thread, [ Posts.Post ])] -ensurePosts = undefined - - -- Convert Post to DbPost apiPostToArchivePost :: Threads.Thread -> JSONPosts.Post -> Posts.Post apiPostToArchivePost thread post = @@ -218,6 +217,34 @@ concatMapM op = foldr f (pure []) pure $ x_ ++ xs_ +setPostIdInPosts :: [ (JSONPosts.Post, Posts.Post) ] -> [ Client.PostId ] -> [ (JSONPosts.Post, Posts.Post) ] +setPostIdInPosts post_pairs ids = map f ids + where + post_map :: Map.Map (Int64, Int64) (JSONPosts.Post, Posts.Post) + post_map = Map.fromList (map (\(i, j) -> ((Posts.thread_id j, Posts.board_post_id j), (i, j))) post_pairs) + + f :: Client.PostId -> (JSONPosts.Post, Posts.Post) + f (Client.PostId { Client.post_id = asdf1, Client.thread_id = asdf2, Client.board_post_id = asdf3 }) = + (\(i, j) -> (i, j { Posts.post_id = Just asdf1 })) (post_map Map.! (asdf2, asdf3)) + + +fileToAttachment :: Posts.Post -> JS.File -> IO Attachments.Attachment +fileToAttachment post file = do + -- sha :: Text <- undefined + + return Attachments.Attachment + { Attachments.attachment_id = Nothing + , Attachments.mimetype = "undefined/undefined" + , Attachments.creation_time = Posts.creation_time post + , Attachments.sha256_hash = undefined + , Attachments.phash = undefined -- oh shit? we need a network request for this + -- but here we don't want to make a network request for every file we get for every post. + -- - probably most of them will already be in the database! + , Attachments.illegal = False + , Attachments.post_id = fromJust $ Posts.post_id post + } + + processBoard :: JSONSettings -> Boards.Board -> IO () processBoard settings board = do let catalogPath = backupDir "catalog.json" @@ -233,14 +260,28 @@ processBoard settings board = do all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board - -- putStrLn $ "Number of posts on /" ++ (Boards.pathpart board) ++ "/ " ++ (show $ length all_posts_on_board) - posts_result <- Client.postPosts settings (concatMap (\(t, posts) -> map (apiPostToArchivePost t) posts) all_posts_on_board) + let postPairs :: [ (JSONPosts.Post, Posts.Post) ] = concatMap + ( \(t, posts) -> map (\p -> (p, apiPostToArchivePost t p)) posts ) + all_posts_on_board + -- putStrLn $ "Number of posts on /" ++ (Boards.pathpart board) ++ "/ " ++ (show $ length all_posts_on_board) + posts_result <- Client.postPosts settings (map snd postPairs) -- TODO: why doesn't it insert posts for threads that already exist? we can have new posts! case posts_result of Left err -> print err - Right new_ids -> do + Right (new_ids :: [ Client.PostId ]) -> do + let perfectPostPairs = setPostIdInPosts postPairs new_ids + + -- must call + -- Client.postAttachments settings (all_attachments_on_board :: [ Attachments.Attachment ]) + -- so we need [ Attachments.Attachment ] + -- JSONCommonTypes.File are properties of JSONPost.Post records. + -- - So need f :: Posts.Post -> JSONCommonTypes.File -> IO [ Attachments.Attachment ] + -- - f also does the sha256 calculation + -- + -- need all JSONPost.Posts + putStrLn "Sum of post_ids:" print $ sum $ map Client.post_id new_ids putStrLn "Sum of board_post_ids:" @@ -295,6 +336,14 @@ processBackupDirectory settings = do mapM_ (processBoard settings) boards_we_have_data_for +-- TODO: detect saged threads by reading the bump time from the thread and comparing +-- that time to the timestamp of the most recent post. If the post is newer +-- - then the thread is being saged. Reasons it can be saged: +-- - it's saged by a mod +-- - the post has sage in the email field +-- - the thread is full. + + main :: IO () main = do settingsValue <- cmdArgs $ SettingsCLI "backfill_settings.json" diff --git a/src/DataClient.hs b/src/DataClient.hs index e193a5b..c5682c1 100644 --- a/src/DataClient.hs +++ b/src/DataClient.hs @@ -13,6 +13,7 @@ module DataClient , getThreads , postThreads , postPosts + , getAttachments ) where import Data.Int (Int64) @@ -37,6 +38,7 @@ import qualified JSONSettings as T import qualified SitesType as Sites import qualified BoardsType as Boards import qualified ThreadType as Threads +import qualified AttachmentType as Attachments import qualified Common.PostsType as Posts data HttpError @@ -50,6 +52,14 @@ data PostId = PostId , thread_id :: Int64 } deriving (Show, Generic, FromJSON) +{- +data AttachmentId = AttachmentId + { attachment_id :: Int64 + , post_id_ :: Int64 + , sha256_hash :: Text + } deriving (Show, Generic, FromJSON) +-} + get :: T.JSONSettings -> String -> IO (Either HttpError LBS.ByteString) get settings path = do let requestUrl = T.postgrest_url settings ++ path @@ -162,6 +172,16 @@ getThreads settings board_id board_thread_ids = path = "/threads?board_thread_id=in.(" ++ ids ++ ")&board_id=eq." ++ show board_id ids :: String = intercalate "," $ map show board_thread_ids + +getAttachments :: T.JSONSettings -> [ Int64 ] -> IO (Either HttpError [ Attachments.Attachment ]) +getAttachments settings post_ids = + get settings path >>= return . eitherDecodeResponse + + where + path :: String = "/attachments?post_id=in.(" ++ hashes ++ ")" + hashes :: String = intercalate "," $ (map show post_ids) + + postPosts :: T.JSONSettings -> [ Posts.Post ] @@ -172,6 +192,17 @@ postPosts settings posts = where payload = encode $ object [ "new_posts" .= posts ] +{- +postAttachments + :: T.JSONSettings + -> [ Attachments.Attachment ] + -> IO (Either HttpError [ AttachmentId ]) +postAttachments settings attachments = + post settings "/rpc/insert_attachments_and_return_ids" payload True >>= return . eitherDecodeResponse + + where + payload = encode $ object [ "attachments_payload" .= attachments ] +-} eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a eitherDecodeResponse (Left err) = Left err diff --git a/src/ThreadType.hs b/src/ThreadType.hs index 75379cc..588cfd4 100644 --- a/src/ThreadType.hs +++ b/src/ThreadType.hs @@ -7,9 +7,10 @@ module ThreadType import GHC.Generics import Data.Aeson (FromJSON) import Data.Time.Clock (UTCTime) -- Required for timestamp with time zone +import Data.Int (Int64) data Thread = Thread - { thread_id :: Int + { thread_id :: Int64 , board_thread_id :: Int , creation_time :: UTCTime , board_id :: Int