WIP: saving attachments

This commit is contained in:
towards-a-new-leftypol 2024-01-17 16:36:49 -05:00
parent 103bf86017
commit f07907df1b
5 changed files with 103 additions and 17 deletions

View File

@ -71,6 +71,7 @@ CREATE TABLE IF NOT EXISTS posts
, email text
, body_search_index tsvector
, thread_id bigint NOT NULL
-- , TODO: embed
, CONSTRAINT unique_thread_board_id_constraint UNIQUE (thread_id, board_post_id)
, CONSTRAINT thread_fk FOREIGN KEY (thread_id) REFERENCES threads (thread_id) ON DELETE CASCADE
);
@ -181,6 +182,8 @@ $$ LANGUAGE sql;
-- 1:21 for full db (nothing inserted)
/*
* Is this even needed?
CREATE OR REPLACE FUNCTION insert_attachments_and_return_ids(
attachments_payload attachments[])
RETURNS TABLE (attachment_id bigint, post_id bigint, sha256_hash text) AS $$
@ -209,6 +212,7 @@ SELECT * FROM inserted
UNION ALL
SELECT * FROM selected;
$$ LANGUAGE sql;
*/
CREATE OR REPLACE FUNCTION fetch_top_threads(

View File

@ -5,15 +5,16 @@ module AttachmentType
import GHC.Generics
import Data.Int (Int64)
import Data.Aeson (FromJSON)
import Data.Aeson (FromJSON, ToJSON)
import Data.Text (Text)
import Data.Time.Clock (UTCTime)
data Attachment = Attachment
{ attachment_id :: Maybe Int64
, mimetype :: Text
, creation_time :: UTCTime
, sha256_hash :: Int
, sha256_hash :: Text
, phash :: Int64
, phash :: Bool
, illegal :: Bool
, post_id :: Int64
} deriving (Show, Generic, FromJSON)
} deriving (Show, Generic, FromJSON, ToJSON)

View File

@ -1,9 +1,12 @@
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Redundant bracket" #-}
{-# HLINT ignore "Use fromMaybe" #-}
{-# LANGUAGE OverloadedStrings #-}
module Main where
import System.Exit
import Data.Int (Int64)
import Control.Monad (filterM)
import Data.Aeson (decode)
import qualified Data.ByteString.Lazy as B
@ -15,15 +18,19 @@ import qualified Data.Set as Set
import Data.Set (Set)
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Data.Time.Clock (UTCTime)
import qualified Data.Map as Map
import Data.Maybe (fromJust)
import JSONParsing
import JSONSettings
import qualified JSONCommonTypes as JS
import qualified JSONPost as JSONPosts
import qualified DataClient as Client
import qualified SitesType as Sites
import qualified BoardsType as Boards
import qualified ThreadType as Threads
import qualified Common.PostsType as Posts
import qualified AttachmentType as Attachments
import qualified Common.PostsType as Posts
newtype SettingsCLI = SettingsCLI
{ jsonFile :: FilePath
@ -181,14 +188,6 @@ readPosts settings board thread = do
backupDir = backup_read_root settings </> Boards.pathpart board
ensurePosts
:: JSONSettings
-> Boards.Board
-> [(Threads.Thread, [ Posts.Post ])]
-> IO [(Threads.Thread, [ Posts.Post ])]
ensurePosts = undefined
-- Convert Post to DbPost
apiPostToArchivePost :: Threads.Thread -> JSONPosts.Post -> Posts.Post
apiPostToArchivePost thread post =
@ -218,6 +217,34 @@ concatMapM op = foldr f (pure [])
pure $ x_ ++ xs_
setPostIdInPosts :: [ (JSONPosts.Post, Posts.Post) ] -> [ Client.PostId ] -> [ (JSONPosts.Post, Posts.Post) ]
setPostIdInPosts post_pairs ids = map f ids
where
post_map :: Map.Map (Int64, Int64) (JSONPosts.Post, Posts.Post)
post_map = Map.fromList (map (\(i, j) -> ((Posts.thread_id j, Posts.board_post_id j), (i, j))) post_pairs)
f :: Client.PostId -> (JSONPosts.Post, Posts.Post)
f (Client.PostId { Client.post_id = asdf1, Client.thread_id = asdf2, Client.board_post_id = asdf3 }) =
(\(i, j) -> (i, j { Posts.post_id = Just asdf1 })) (post_map Map.! (asdf2, asdf3))
fileToAttachment :: Posts.Post -> JS.File -> IO Attachments.Attachment
fileToAttachment post file = do
-- sha :: Text <- undefined
return Attachments.Attachment
{ Attachments.attachment_id = Nothing
, Attachments.mimetype = "undefined/undefined"
, Attachments.creation_time = Posts.creation_time post
, Attachments.sha256_hash = undefined
, Attachments.phash = undefined -- oh shit? we need a network request for this
-- but here we don't want to make a network request for every file we get for every post.
-- - probably most of them will already be in the database!
, Attachments.illegal = False
, Attachments.post_id = fromJust $ Posts.post_id post
}
processBoard :: JSONSettings -> Boards.Board -> IO ()
processBoard settings board = do
let catalogPath = backupDir </> "catalog.json"
@ -233,14 +260,28 @@ processBoard settings board = do
all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board
-- putStrLn $ "Number of posts on /" ++ (Boards.pathpart board) ++ "/ " ++ (show $ length all_posts_on_board)
posts_result <- Client.postPosts settings (concatMap (\(t, posts) -> map (apiPostToArchivePost t) posts) all_posts_on_board)
let postPairs :: [ (JSONPosts.Post, Posts.Post) ] = concatMap
( \(t, posts) -> map (\p -> (p, apiPostToArchivePost t p)) posts )
all_posts_on_board
-- putStrLn $ "Number of posts on /" ++ (Boards.pathpart board) ++ "/ " ++ (show $ length all_posts_on_board)
posts_result <- Client.postPosts settings (map snd postPairs)
-- TODO: why doesn't it insert posts for threads that already exist? we can have new posts!
case posts_result of
Left err -> print err
Right new_ids -> do
Right (new_ids :: [ Client.PostId ]) -> do
let perfectPostPairs = setPostIdInPosts postPairs new_ids
-- must call
-- Client.postAttachments settings (all_attachments_on_board :: [ Attachments.Attachment ])
-- so we need [ Attachments.Attachment ]
-- JSONCommonTypes.File are properties of JSONPost.Post records.
-- - So need f :: Posts.Post -> JSONCommonTypes.File -> IO [ Attachments.Attachment ]
-- - f also does the sha256 calculation
--
-- need all JSONPost.Posts
putStrLn "Sum of post_ids:"
print $ sum $ map Client.post_id new_ids
putStrLn "Sum of board_post_ids:"
@ -295,6 +336,14 @@ processBackupDirectory settings = do
mapM_ (processBoard settings) boards_we_have_data_for
-- TODO: detect saged threads by reading the bump time from the thread and comparing
-- that time to the timestamp of the most recent post. If the post is newer
-- - then the thread is being saged. Reasons it can be saged:
-- - it's saged by a mod
-- - the post has sage in the email field
-- - the thread is full.
main :: IO ()
main = do
settingsValue <- cmdArgs $ SettingsCLI "backfill_settings.json"

View File

@ -13,6 +13,7 @@ module DataClient
, getThreads
, postThreads
, postPosts
, getAttachments
) where
import Data.Int (Int64)
@ -37,6 +38,7 @@ import qualified JSONSettings as T
import qualified SitesType as Sites
import qualified BoardsType as Boards
import qualified ThreadType as Threads
import qualified AttachmentType as Attachments
import qualified Common.PostsType as Posts
data HttpError
@ -50,6 +52,14 @@ data PostId = PostId
, thread_id :: Int64
} deriving (Show, Generic, FromJSON)
{-
data AttachmentId = AttachmentId
{ attachment_id :: Int64
, post_id_ :: Int64
, sha256_hash :: Text
} deriving (Show, Generic, FromJSON)
-}
get :: T.JSONSettings -> String -> IO (Either HttpError LBS.ByteString)
get settings path = do
let requestUrl = T.postgrest_url settings ++ path
@ -162,6 +172,16 @@ getThreads settings board_id board_thread_ids =
path = "/threads?board_thread_id=in.(" ++ ids ++ ")&board_id=eq." ++ show board_id
ids :: String = intercalate "," $ map show board_thread_ids
getAttachments :: T.JSONSettings -> [ Int64 ] -> IO (Either HttpError [ Attachments.Attachment ])
getAttachments settings post_ids =
get settings path >>= return . eitherDecodeResponse
where
path :: String = "/attachments?post_id=in.(" ++ hashes ++ ")"
hashes :: String = intercalate "," $ (map show post_ids)
postPosts
:: T.JSONSettings
-> [ Posts.Post ]
@ -172,6 +192,17 @@ postPosts settings posts =
where
payload = encode $ object [ "new_posts" .= posts ]
{-
postAttachments
:: T.JSONSettings
-> [ Attachments.Attachment ]
-> IO (Either HttpError [ AttachmentId ])
postAttachments settings attachments =
post settings "/rpc/insert_attachments_and_return_ids" payload True >>= return . eitherDecodeResponse
where
payload = encode $ object [ "attachments_payload" .= attachments ]
-}
eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a
eitherDecodeResponse (Left err) = Left err

View File

@ -7,9 +7,10 @@ module ThreadType
import GHC.Generics
import Data.Aeson (FromJSON)
import Data.Time.Clock (UTCTime) -- Required for timestamp with time zone
import Data.Int (Int64)
data Thread = Thread
{ thread_id :: Int
{ thread_id :: Int64
, board_thread_id :: Int
, creation_time :: UTCTime
, board_id :: Int