WIP: saving attachments

This commit is contained in:
towards-a-new-leftypol 2024-01-17 16:36:49 -05:00
parent 103bf86017
commit f07907df1b
5 changed files with 103 additions and 17 deletions

View File

@ -71,6 +71,7 @@ CREATE TABLE IF NOT EXISTS posts
, email text , email text
, body_search_index tsvector , body_search_index tsvector
, thread_id bigint NOT NULL , thread_id bigint NOT NULL
-- , TODO: embed
, CONSTRAINT unique_thread_board_id_constraint UNIQUE (thread_id, board_post_id) , CONSTRAINT unique_thread_board_id_constraint UNIQUE (thread_id, board_post_id)
, CONSTRAINT thread_fk FOREIGN KEY (thread_id) REFERENCES threads (thread_id) ON DELETE CASCADE , CONSTRAINT thread_fk FOREIGN KEY (thread_id) REFERENCES threads (thread_id) ON DELETE CASCADE
); );
@ -181,6 +182,8 @@ $$ LANGUAGE sql;
-- 1:21 for full db (nothing inserted) -- 1:21 for full db (nothing inserted)
/*
* Is this even needed?
CREATE OR REPLACE FUNCTION insert_attachments_and_return_ids( CREATE OR REPLACE FUNCTION insert_attachments_and_return_ids(
attachments_payload attachments[]) attachments_payload attachments[])
RETURNS TABLE (attachment_id bigint, post_id bigint, sha256_hash text) AS $$ RETURNS TABLE (attachment_id bigint, post_id bigint, sha256_hash text) AS $$
@ -209,6 +212,7 @@ SELECT * FROM inserted
UNION ALL UNION ALL
SELECT * FROM selected; SELECT * FROM selected;
$$ LANGUAGE sql; $$ LANGUAGE sql;
*/
CREATE OR REPLACE FUNCTION fetch_top_threads( CREATE OR REPLACE FUNCTION fetch_top_threads(

View File

@ -5,15 +5,16 @@ module AttachmentType
import GHC.Generics import GHC.Generics
import Data.Int (Int64) import Data.Int (Int64)
import Data.Aeson (FromJSON) import Data.Aeson (FromJSON, ToJSON)
import Data.Text (Text) import Data.Text (Text)
import Data.Time.Clock (UTCTime)
data Attachment = Attachment data Attachment = Attachment
{ attachment_id :: Maybe Int64 { attachment_id :: Maybe Int64
, mimetype :: Text , mimetype :: Text
, creation_time :: UTCTime , creation_time :: UTCTime
, sha256_hash :: Int , sha256_hash :: Text
, phash :: Int64 , phash :: Int64
, phash :: Bool , illegal :: Bool
, post_id :: Int64 , post_id :: Int64
} deriving (Show, Generic, FromJSON) } deriving (Show, Generic, FromJSON, ToJSON)

View File

@ -1,9 +1,12 @@
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-} {-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Redundant bracket" #-} {-# HLINT ignore "Redundant bracket" #-}
{-# HLINT ignore "Use fromMaybe" #-} {-# HLINT ignore "Use fromMaybe" #-}
{-# LANGUAGE OverloadedStrings #-}
module Main where module Main where
import System.Exit import System.Exit
import Data.Int (Int64)
import Control.Monad (filterM) import Control.Monad (filterM)
import Data.Aeson (decode) import Data.Aeson (decode)
import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy as B
@ -15,15 +18,19 @@ import qualified Data.Set as Set
import Data.Set (Set) import Data.Set (Set)
import Data.Time.Clock.POSIX (posixSecondsToUTCTime) import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Data.Time.Clock (UTCTime) import Data.Time.Clock (UTCTime)
import qualified Data.Map as Map
import Data.Maybe (fromJust)
import JSONParsing import JSONParsing
import JSONSettings import JSONSettings
import qualified JSONCommonTypes as JS
import qualified JSONPost as JSONPosts import qualified JSONPost as JSONPosts
import qualified DataClient as Client import qualified DataClient as Client
import qualified SitesType as Sites import qualified SitesType as Sites
import qualified BoardsType as Boards import qualified BoardsType as Boards
import qualified ThreadType as Threads import qualified ThreadType as Threads
import qualified Common.PostsType as Posts import qualified AttachmentType as Attachments
import qualified Common.PostsType as Posts
newtype SettingsCLI = SettingsCLI newtype SettingsCLI = SettingsCLI
{ jsonFile :: FilePath { jsonFile :: FilePath
@ -181,14 +188,6 @@ readPosts settings board thread = do
backupDir = backup_read_root settings </> Boards.pathpart board backupDir = backup_read_root settings </> Boards.pathpart board
ensurePosts
:: JSONSettings
-> Boards.Board
-> [(Threads.Thread, [ Posts.Post ])]
-> IO [(Threads.Thread, [ Posts.Post ])]
ensurePosts = undefined
-- Convert Post to DbPost -- Convert Post to DbPost
apiPostToArchivePost :: Threads.Thread -> JSONPosts.Post -> Posts.Post apiPostToArchivePost :: Threads.Thread -> JSONPosts.Post -> Posts.Post
apiPostToArchivePost thread post = apiPostToArchivePost thread post =
@ -218,6 +217,34 @@ concatMapM op = foldr f (pure [])
pure $ x_ ++ xs_ pure $ x_ ++ xs_
setPostIdInPosts :: [ (JSONPosts.Post, Posts.Post) ] -> [ Client.PostId ] -> [ (JSONPosts.Post, Posts.Post) ]
setPostIdInPosts post_pairs ids = map f ids
where
post_map :: Map.Map (Int64, Int64) (JSONPosts.Post, Posts.Post)
post_map = Map.fromList (map (\(i, j) -> ((Posts.thread_id j, Posts.board_post_id j), (i, j))) post_pairs)
f :: Client.PostId -> (JSONPosts.Post, Posts.Post)
f (Client.PostId { Client.post_id = asdf1, Client.thread_id = asdf2, Client.board_post_id = asdf3 }) =
(\(i, j) -> (i, j { Posts.post_id = Just asdf1 })) (post_map Map.! (asdf2, asdf3))
fileToAttachment :: Posts.Post -> JS.File -> IO Attachments.Attachment
fileToAttachment post file = do
-- sha :: Text <- undefined
return Attachments.Attachment
{ Attachments.attachment_id = Nothing
, Attachments.mimetype = "undefined/undefined"
, Attachments.creation_time = Posts.creation_time post
, Attachments.sha256_hash = undefined
, Attachments.phash = undefined -- oh shit? we need a network request for this
-- but here we don't want to make a network request for every file we get for every post.
-- - probably most of them will already be in the database!
, Attachments.illegal = False
, Attachments.post_id = fromJust $ Posts.post_id post
}
processBoard :: JSONSettings -> Boards.Board -> IO () processBoard :: JSONSettings -> Boards.Board -> IO ()
processBoard settings board = do processBoard settings board = do
let catalogPath = backupDir </> "catalog.json" let catalogPath = backupDir </> "catalog.json"
@ -233,14 +260,28 @@ processBoard settings board = do
all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board
-- putStrLn $ "Number of posts on /" ++ (Boards.pathpart board) ++ "/ " ++ (show $ length all_posts_on_board) let postPairs :: [ (JSONPosts.Post, Posts.Post) ] = concatMap
posts_result <- Client.postPosts settings (concatMap (\(t, posts) -> map (apiPostToArchivePost t) posts) all_posts_on_board) ( \(t, posts) -> map (\p -> (p, apiPostToArchivePost t p)) posts )
all_posts_on_board
-- putStrLn $ "Number of posts on /" ++ (Boards.pathpart board) ++ "/ " ++ (show $ length all_posts_on_board)
posts_result <- Client.postPosts settings (map snd postPairs)
-- TODO: why doesn't it insert posts for threads that already exist? we can have new posts! -- TODO: why doesn't it insert posts for threads that already exist? we can have new posts!
case posts_result of case posts_result of
Left err -> print err Left err -> print err
Right new_ids -> do Right (new_ids :: [ Client.PostId ]) -> do
let perfectPostPairs = setPostIdInPosts postPairs new_ids
-- must call
-- Client.postAttachments settings (all_attachments_on_board :: [ Attachments.Attachment ])
-- so we need [ Attachments.Attachment ]
-- JSONCommonTypes.File are properties of JSONPost.Post records.
-- - So need f :: Posts.Post -> JSONCommonTypes.File -> IO [ Attachments.Attachment ]
-- - f also does the sha256 calculation
--
-- need all JSONPost.Posts
putStrLn "Sum of post_ids:" putStrLn "Sum of post_ids:"
print $ sum $ map Client.post_id new_ids print $ sum $ map Client.post_id new_ids
putStrLn "Sum of board_post_ids:" putStrLn "Sum of board_post_ids:"
@ -295,6 +336,14 @@ processBackupDirectory settings = do
mapM_ (processBoard settings) boards_we_have_data_for mapM_ (processBoard settings) boards_we_have_data_for
-- TODO: detect saged threads by reading the bump time from the thread and comparing
-- that time to the timestamp of the most recent post. If the post is newer
-- - then the thread is being saged. Reasons it can be saged:
-- - it's saged by a mod
-- - the post has sage in the email field
-- - the thread is full.
main :: IO () main :: IO ()
main = do main = do
settingsValue <- cmdArgs $ SettingsCLI "backfill_settings.json" settingsValue <- cmdArgs $ SettingsCLI "backfill_settings.json"

View File

@ -13,6 +13,7 @@ module DataClient
, getThreads , getThreads
, postThreads , postThreads
, postPosts , postPosts
, getAttachments
) where ) where
import Data.Int (Int64) import Data.Int (Int64)
@ -37,6 +38,7 @@ import qualified JSONSettings as T
import qualified SitesType as Sites import qualified SitesType as Sites
import qualified BoardsType as Boards import qualified BoardsType as Boards
import qualified ThreadType as Threads import qualified ThreadType as Threads
import qualified AttachmentType as Attachments
import qualified Common.PostsType as Posts import qualified Common.PostsType as Posts
data HttpError data HttpError
@ -50,6 +52,14 @@ data PostId = PostId
, thread_id :: Int64 , thread_id :: Int64
} deriving (Show, Generic, FromJSON) } deriving (Show, Generic, FromJSON)
{-
data AttachmentId = AttachmentId
{ attachment_id :: Int64
, post_id_ :: Int64
, sha256_hash :: Text
} deriving (Show, Generic, FromJSON)
-}
get :: T.JSONSettings -> String -> IO (Either HttpError LBS.ByteString) get :: T.JSONSettings -> String -> IO (Either HttpError LBS.ByteString)
get settings path = do get settings path = do
let requestUrl = T.postgrest_url settings ++ path let requestUrl = T.postgrest_url settings ++ path
@ -162,6 +172,16 @@ getThreads settings board_id board_thread_ids =
path = "/threads?board_thread_id=in.(" ++ ids ++ ")&board_id=eq." ++ show board_id path = "/threads?board_thread_id=in.(" ++ ids ++ ")&board_id=eq." ++ show board_id
ids :: String = intercalate "," $ map show board_thread_ids ids :: String = intercalate "," $ map show board_thread_ids
getAttachments :: T.JSONSettings -> [ Int64 ] -> IO (Either HttpError [ Attachments.Attachment ])
getAttachments settings post_ids =
get settings path >>= return . eitherDecodeResponse
where
path :: String = "/attachments?post_id=in.(" ++ hashes ++ ")"
hashes :: String = intercalate "," $ (map show post_ids)
postPosts postPosts
:: T.JSONSettings :: T.JSONSettings
-> [ Posts.Post ] -> [ Posts.Post ]
@ -172,6 +192,17 @@ postPosts settings posts =
where where
payload = encode $ object [ "new_posts" .= posts ] payload = encode $ object [ "new_posts" .= posts ]
{-
postAttachments
:: T.JSONSettings
-> [ Attachments.Attachment ]
-> IO (Either HttpError [ AttachmentId ])
postAttachments settings attachments =
post settings "/rpc/insert_attachments_and_return_ids" payload True >>= return . eitherDecodeResponse
where
payload = encode $ object [ "attachments_payload" .= attachments ]
-}
eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a
eitherDecodeResponse (Left err) = Left err eitherDecodeResponse (Left err) = Left err

View File

@ -7,9 +7,10 @@ module ThreadType
import GHC.Generics import GHC.Generics
import Data.Aeson (FromJSON) import Data.Aeson (FromJSON)
import Data.Time.Clock (UTCTime) -- Required for timestamp with time zone import Data.Time.Clock (UTCTime) -- Required for timestamp with time zone
import Data.Int (Int64)
data Thread = Thread data Thread = Thread
{ thread_id :: Int { thread_id :: Int64
, board_thread_id :: Int , board_thread_id :: Int
, creation_time :: UTCTime , creation_time :: UTCTime
, board_id :: Int , board_id :: Int