From 7265bc871b35933eba8b14ee4c9045353aee79d3 Mon Sep 17 00:00:00 2001 From: towards-a-new-leftypol Date: Sun, 21 Jan 2024 21:58:27 -0500 Subject: [PATCH] Additional checks to find non-existant files - runs (may have bugs) --- sql/initialize.sql | 1 + src/AttachmentType.hs | 11 +++-- src/Backfill.hs | 99 ++++++++++++++++++++++++++++++------------- src/DataClient.hs | 36 +++++++++++++--- 4 files changed, 106 insertions(+), 41 deletions(-) diff --git a/sql/initialize.sql b/sql/initialize.sql index 4ce7529..0f9fffb 100644 --- a/sql/initialize.sql +++ b/sql/initialize.sql @@ -360,6 +360,7 @@ GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE posts_post_id_seq TO chan_archiver; +GRANT usage, select ON SEQUENCE attachments_attachment_id_seq TO chan_archiver; GRANT chan_archiver TO admin; diff --git a/src/AttachmentType.hs b/src/AttachmentType.hs index 48683ea..cfb4e2e 100644 --- a/src/AttachmentType.hs +++ b/src/AttachmentType.hs @@ -17,16 +17,15 @@ data Dimension = Dimension } deriving (Show, Generic, FromJSON, ToJSON) data Paths = Paths - { file_path :: Text - , thumbnail_path :: Text - } + { file_path :: FilePath + , thumbnail_path :: FilePath + } deriving (Show) data Attachment = Attachment - { attachment_id :: Maybe Int64 - , mimetype :: Text + { mimetype :: Text , creation_time :: UTCTime , sha256_hash :: Text - , phash :: Int64 + , phash :: Maybe Int64 , illegal :: Bool , post_id :: Int64 , resolution :: Maybe Dimension diff --git a/src/Backfill.hs b/src/Backfill.hs index 0e48a98..47e3f8b 100644 --- a/src/Backfill.hs +++ b/src/Backfill.hs @@ -22,6 +22,7 @@ import qualified Data.Map as Map import Data.Maybe (fromJust) import Data.Text (Text, unpack) import Data.Text.Encoding (decodeUtf8) +import Data.List (isSuffixOf) import Network.Mime (defaultMimeLookup, defaultMimeType) import PerceptualHash (fileHash) @@ -237,11 +238,10 @@ setPostIdInPosts post_pairs ids = map f ids fileToAttachment :: Posts.Post -> JS.File -> At.Attachment fileToAttachment post file = At.Attachment - { At.attachment_id = Nothing - , At.mimetype = maybe "undefined/undefined" id (JS.mime file) + { At.mimetype = maybe "undefined/undefined" id (JS.mime file) , At.creation_time = Posts.creation_time post , At.sha256_hash = undefined - , At.phash = undefined + , At.phash = Nothing , At.illegal = False , At.post_id = fromJust $ Posts.post_id post , At.resolution = dim @@ -257,6 +257,14 @@ getMimeType :: Text -> Text getMimeType ext = decodeUtf8 $ defaultMimeLookup ext +phash_mimetypes :: Set.Set Text +phash_mimetypes = Set.fromList + [ "image/jpeg" + , "image/png" + , "image/gif" + ] + + processFiles :: JSONSettings -> [(JSONPosts.Post, Posts.Post)] -> IO () processFiles settings post_pairs = do -- perfect just means that our posts have ids, they're already inserted into the db existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id . snd) post_pairs) @@ -269,51 +277,73 @@ processFiles settings post_pairs = do -- perfect just means that our posts have let map_existing :: Map.Map Int64 [ At.Attachment ] = foldl (insertRecord At.post_id) Map.empty existing_attachments - -- have things like sha256 already - -- how do we know that a `elem` attachments_on_board and a `elem` existing_attachments - -- can group existing_attachments into `map_existing :: Map post_id -> Set Attachment` - -- same with attachments_on_board into `map_should_exist :: Map post_id -> Set Attachment` - -- - -- then run through the keys and compare size of the sets let attachments_on_board :: [(At.Paths, At.Attachment)] = concatMap parseAttachments post_pairs let map_should_exist :: Map.Map Int64 [(At.Paths, At.Attachment)] = foldl (insertRecord (At.post_id . snd)) Map.empty attachments_on_board + -- basically we have a map of the post_id to attachments for that post_id + -- compare the number of attachments under each post_id in the map + -- assembled from the API with the map assembled from data in the db. + -- if the number of files differ, then likely there are new attachments. + -- This isn't fool-proof (there are probably weird cases where the + -- number of files for a post is the same but the files themselves + -- are different) but it's a quicker way of seeing which attachments + -- need to be saved than finding the sha256 hash of every single existing + -- file. let to_insert_map = Map.filterWithKey (compareAttMaps map_existing) map_should_exist let to_insert = foldr (++) [] $ Map.elems to_insert_map - have_hash <- mapM computeAttachmentHash to_insert + to_insert_exist <- filterM attachmentFileExists to_insert + + with_hashes <- mapM computeAttachmentHash to_insert_exist let existing_hashes :: Set.Set Text = Set.fromList $ map At.sha256_hash existing_attachments - let to_insert_ = filter ((`Set.notMember` existing_hashes) . At.sha256_hash) have_hash + let to_insert_ = filter ((`Set.notMember` existing_hashes) . At.sha256_hash) with_hashes - _ <- Client.postAttachments settings to_insert_ + attachments_result <- Client.postAttachments settings to_insert_ - -- TODO: Concat all values in to_insert_map and - -- go ahead and compute sha256 and phashes on them. - -- ensure that the sha256 in to_insert_map is not in map_existing + case attachments_result of + Left err -> do + putStrLn $ "Error posting attachments: " ++ show err + exitFailure - return () + Right saved -> do + putStrLn $ "Saved " ++ (show $ length saved) ++ " attachments!" where + attachmentFileExists :: (At.Paths, At.Attachment) -> IO Bool + attachmentFileExists (p, _) = doesFileExist (At.file_path p) + computeAttachmentHash :: (At.Paths, At.Attachment) -> IO At.Attachment computeAttachmentHash (p, q) = do - let f = backup_read_root settings <> "/" <> unpack (At.file_path p) + let f = At.file_path p + + putStrLn $ "Reading " ++ f + -- putStrLn $ show p + -- putStrLn $ show (q { At.sha256_hash = "undefined" }) sha256_sum <- Hash.computeSHA256 f - either_phash <- fileHash f + putStrLn $ "SHA-256: " ++ unpack sha256_sum - phash :: Int64 <- case either_phash of - Left err_str -> do - putStrLn $ "Failed to compute phash for file " ++ (unpack sha256_sum) ++ " " ++ f ++ " " ++ err_str - return (-1) - Right phash_w -> return $ Words.wordToSignedInt64 phash_w + phash :: Maybe Int64 <- + case (At.mimetype q) `Set.member` phash_mimetypes of + True -> do + either_phash <- fileHash f + case either_phash of + Left err_str -> do + putStrLn $ "Failed to compute phash for file " ++ (unpack sha256_sum) ++ " " ++ f ++ " " ++ err_str + return Nothing + Right phash_w -> do + putStrLn $ "phash: " ++ show phash_w + return $ Just $ Words.wordToSignedInt64 phash_w + + False -> return Nothing return q @@ -329,16 +359,26 @@ processFiles settings post_pairs = do -- perfect just means that our posts have let board = JSONPosts.board post - let file_path = board <> "/src/" <> tim <> ext - let thumbnail_path = board <> "/thumb/" <> tim <> ext + let file_path = withPathPrefix $ board <> "/src/" <> tim <> ext + let thumbnail_path = withPathPrefix $ board <> "/thumb/" <> tim <> ext Just $ At.Paths file_path thumbnail_path + notDeleted :: (At.Paths, At.Attachment) -> Bool + notDeleted (p, _) = not $ "deleted" `isSuffixOf` (At.file_path p) + + + withPathPrefix :: Text -> FilePath + withPathPrefix = ((<>) $ backup_read_root settings) . unpack + parseAttachments :: (JSONPosts.Post, Posts.Post) -> [(At.Paths, At.Attachment)] - parseAttachments (p, q) = + parseAttachments (p, q) = filter notDeleted $ case JSONPosts.files p of - Just files -> map (\x -> (At.Paths (JS.file_path x) (JS.thumb_path x), fileToAttachment q x)) files + Just files -> map (\x -> + ( At.Paths (withPathPrefix $ JS.file_path x) (withPathPrefix $ JS.thumb_path x) + , fileToAttachment q x) + ) files Nothing -> case parseLegacyPaths p of Nothing -> [] @@ -349,11 +389,10 @@ processFiles settings post_pairs = do -- perfect just means that our posts have in ( paths , At.Attachment - { At.attachment_id = Nothing - , At.mimetype = mime + { At.mimetype = mime , At.creation_time = Posts.creation_time q , At.sha256_hash = undefined - , At.phash = undefined + , At.phash = Nothing , At.illegal = False , At.post_id = fromJust $ Posts.post_id q , At.resolution = dim diff --git a/src/DataClient.hs b/src/DataClient.hs index 0ba11e5..fe91a44 100644 --- a/src/DataClient.hs +++ b/src/DataClient.hs @@ -17,13 +17,16 @@ module DataClient , postAttachments ) where +import Control.Monad (forM) import Data.Int (Int64) +import Data.Either (lefts, rights) import Network.HTTP.Simple hiding (httpLbs) import Network.HTTP.Client ( newManager , managerSetMaxHeaderLength , httpLbs ) +import qualified Data.ByteString.Lazy.Char8 as BL import Network.HTTP.Client.Conduit (defaultManagerSettings) import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Lazy.Char8 as LC8 @@ -185,20 +188,43 @@ getThreads settings board_id board_thread_ids = ids :: String = intercalate "," $ map show board_thread_ids -getAttachments :: T.JSONSettings -> [ Int64 ] -> IO (Either HttpError [ Attachments.Attachment ]) -getAttachments settings post_ids = +-- | Splits a list into chunks of a given size. +chunkList :: Int -> [a] -> [[a]] +chunkList _ [] = [] +chunkList n xs = let (chunk, rest) = splitAt n xs in chunk : chunkList n rest + + +getAttachments :: T.JSONSettings -> [Int64] -> IO (Either HttpError [Attachments.Attachment]) +getAttachments settings post_ids = do + results <- forM (chunkList chunkSize post_ids) (getAttachmentsChunk settings) + return $ combineResults results + where + chunkSize = 1000 + + +-- | Combines the results, prioritizing errors. +combineResults :: [Either e [b]] -> Either e [b] +combineResults results = + case lefts results of + [] -> Right (concat (rights results)) + (err:_) -> Left err + + +-- | Function to handle each chunk. +getAttachmentsChunk :: T.JSONSettings -> [Int64] -> IO (Either HttpError [Attachments.Attachment]) +getAttachmentsChunk settings chunk = get settings path >>= return . eitherDecodeResponse where - path :: String = "/attachments?post_id=in.(" ++ hashes ++ ")" - hashes :: String = intercalate "," $ (map show post_ids) + path = "/attachments?post_id=in.(" ++ intercalate "," (map show chunk) ++ ")" postAttachments :: T.JSONSettings -> [ Attachments.Attachment ] -> IO (Either HttpError [ Attachments.Attachment ]) -postAttachments settings attachments = +postAttachments settings attachments = do + BL.putStrLn payload post settings "/attachments" payload True >>= return . eitherDecodeResponse where