Additional checks to find non-existant files

- runs (may have bugs)
This commit is contained in:
towards-a-new-leftypol 2024-01-21 21:58:27 -05:00
parent 29c8f4eedb
commit 7265bc871b
4 changed files with 106 additions and 41 deletions

View File

@ -360,6 +360,7 @@ GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE posts_post_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE posts_post_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE attachments_attachment_id_seq TO chan_archiver;
GRANT chan_archiver TO admin; GRANT chan_archiver TO admin;

View File

@ -17,16 +17,15 @@ data Dimension = Dimension
} deriving (Show, Generic, FromJSON, ToJSON) } deriving (Show, Generic, FromJSON, ToJSON)
data Paths = Paths data Paths = Paths
{ file_path :: Text { file_path :: FilePath
, thumbnail_path :: Text , thumbnail_path :: FilePath
} } deriving (Show)
data Attachment = Attachment data Attachment = Attachment
{ attachment_id :: Maybe Int64 { mimetype :: Text
, mimetype :: Text
, creation_time :: UTCTime , creation_time :: UTCTime
, sha256_hash :: Text , sha256_hash :: Text
, phash :: Int64 , phash :: Maybe Int64
, illegal :: Bool , illegal :: Bool
, post_id :: Int64 , post_id :: Int64
, resolution :: Maybe Dimension , resolution :: Maybe Dimension

View File

@ -22,6 +22,7 @@ import qualified Data.Map as Map
import Data.Maybe (fromJust) import Data.Maybe (fromJust)
import Data.Text (Text, unpack) import Data.Text (Text, unpack)
import Data.Text.Encoding (decodeUtf8) import Data.Text.Encoding (decodeUtf8)
import Data.List (isSuffixOf)
import Network.Mime (defaultMimeLookup, defaultMimeType) import Network.Mime (defaultMimeLookup, defaultMimeType)
import PerceptualHash (fileHash) import PerceptualHash (fileHash)
@ -237,11 +238,10 @@ setPostIdInPosts post_pairs ids = map f ids
fileToAttachment :: Posts.Post -> JS.File -> At.Attachment fileToAttachment :: Posts.Post -> JS.File -> At.Attachment
fileToAttachment post file = fileToAttachment post file =
At.Attachment At.Attachment
{ At.attachment_id = Nothing { At.mimetype = maybe "undefined/undefined" id (JS.mime file)
, At.mimetype = maybe "undefined/undefined" id (JS.mime file)
, At.creation_time = Posts.creation_time post , At.creation_time = Posts.creation_time post
, At.sha256_hash = undefined , At.sha256_hash = undefined
, At.phash = undefined , At.phash = Nothing
, At.illegal = False , At.illegal = False
, At.post_id = fromJust $ Posts.post_id post , At.post_id = fromJust $ Posts.post_id post
, At.resolution = dim , At.resolution = dim
@ -257,6 +257,14 @@ getMimeType :: Text -> Text
getMimeType ext = decodeUtf8 $ defaultMimeLookup ext getMimeType ext = decodeUtf8 $ defaultMimeLookup ext
phash_mimetypes :: Set.Set Text
phash_mimetypes = Set.fromList
[ "image/jpeg"
, "image/png"
, "image/gif"
]
processFiles :: JSONSettings -> [(JSONPosts.Post, Posts.Post)] -> IO () processFiles :: JSONSettings -> [(JSONPosts.Post, Posts.Post)] -> IO ()
processFiles settings post_pairs = do -- perfect just means that our posts have ids, they're already inserted into the db processFiles settings post_pairs = do -- perfect just means that our posts have ids, they're already inserted into the db
existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id . snd) post_pairs) existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id . snd) post_pairs)
@ -269,51 +277,73 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
let map_existing :: Map.Map Int64 [ At.Attachment ] = let map_existing :: Map.Map Int64 [ At.Attachment ] =
foldl (insertRecord At.post_id) Map.empty existing_attachments foldl (insertRecord At.post_id) Map.empty existing_attachments
-- have things like sha256 already
-- how do we know that a `elem` attachments_on_board and a `elem` existing_attachments
-- can group existing_attachments into `map_existing :: Map post_id -> Set Attachment`
-- same with attachments_on_board into `map_should_exist :: Map post_id -> Set Attachment`
--
-- then run through the keys and compare size of the sets
let attachments_on_board :: [(At.Paths, At.Attachment)] = let attachments_on_board :: [(At.Paths, At.Attachment)] =
concatMap parseAttachments post_pairs concatMap parseAttachments post_pairs
let map_should_exist :: Map.Map Int64 [(At.Paths, At.Attachment)] = let map_should_exist :: Map.Map Int64 [(At.Paths, At.Attachment)] =
foldl (insertRecord (At.post_id . snd)) Map.empty attachments_on_board foldl (insertRecord (At.post_id . snd)) Map.empty attachments_on_board
-- basically we have a map of the post_id to attachments for that post_id
-- compare the number of attachments under each post_id in the map
-- assembled from the API with the map assembled from data in the db.
-- if the number of files differ, then likely there are new attachments.
-- This isn't fool-proof (there are probably weird cases where the
-- number of files for a post is the same but the files themselves
-- are different) but it's a quicker way of seeing which attachments
-- need to be saved than finding the sha256 hash of every single existing
-- file.
let to_insert_map = Map.filterWithKey (compareAttMaps map_existing) map_should_exist let to_insert_map = Map.filterWithKey (compareAttMaps map_existing) map_should_exist
let to_insert = foldr (++) [] $ Map.elems to_insert_map let to_insert = foldr (++) [] $ Map.elems to_insert_map
have_hash <- mapM computeAttachmentHash to_insert to_insert_exist <- filterM attachmentFileExists to_insert
with_hashes <- mapM computeAttachmentHash to_insert_exist
let existing_hashes :: Set.Set Text = let existing_hashes :: Set.Set Text =
Set.fromList $ map At.sha256_hash existing_attachments Set.fromList $ map At.sha256_hash existing_attachments
let to_insert_ = filter ((`Set.notMember` existing_hashes) . At.sha256_hash) have_hash let to_insert_ = filter ((`Set.notMember` existing_hashes) . At.sha256_hash) with_hashes
_ <- Client.postAttachments settings to_insert_ attachments_result <- Client.postAttachments settings to_insert_
-- TODO: Concat all values in to_insert_map and case attachments_result of
-- go ahead and compute sha256 and phashes on them. Left err -> do
-- ensure that the sha256 in to_insert_map is not in map_existing putStrLn $ "Error posting attachments: " ++ show err
exitFailure
return () Right saved -> do
putStrLn $ "Saved " ++ (show $ length saved) ++ " attachments!"
where where
attachmentFileExists :: (At.Paths, At.Attachment) -> IO Bool
attachmentFileExists (p, _) = doesFileExist (At.file_path p)
computeAttachmentHash :: (At.Paths, At.Attachment) -> IO At.Attachment computeAttachmentHash :: (At.Paths, At.Attachment) -> IO At.Attachment
computeAttachmentHash (p, q) = do computeAttachmentHash (p, q) = do
let f = backup_read_root settings <> "/" <> unpack (At.file_path p) let f = At.file_path p
putStrLn $ "Reading " ++ f
-- putStrLn $ show p
-- putStrLn $ show (q { At.sha256_hash = "undefined" })
sha256_sum <- Hash.computeSHA256 f sha256_sum <- Hash.computeSHA256 f
either_phash <- fileHash f putStrLn $ "SHA-256: " ++ unpack sha256_sum
phash :: Int64 <- case either_phash of phash :: Maybe Int64 <-
case (At.mimetype q) `Set.member` phash_mimetypes of
True -> do
either_phash <- fileHash f
case either_phash of
Left err_str -> do Left err_str -> do
putStrLn $ "Failed to compute phash for file " ++ (unpack sha256_sum) ++ " " ++ f ++ " " ++ err_str putStrLn $ "Failed to compute phash for file " ++ (unpack sha256_sum) ++ " " ++ f ++ " " ++ err_str
return (-1) return Nothing
Right phash_w -> return $ Words.wordToSignedInt64 phash_w Right phash_w -> do
putStrLn $ "phash: " ++ show phash_w
return $ Just $ Words.wordToSignedInt64 phash_w
False -> return Nothing
return q return q
@ -329,16 +359,26 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
let board = JSONPosts.board post let board = JSONPosts.board post
let file_path = board <> "/src/" <> tim <> ext let file_path = withPathPrefix $ board <> "/src/" <> tim <> ext
let thumbnail_path = board <> "/thumb/" <> tim <> ext let thumbnail_path = withPathPrefix $ board <> "/thumb/" <> tim <> ext
Just $ At.Paths file_path thumbnail_path Just $ At.Paths file_path thumbnail_path
notDeleted :: (At.Paths, At.Attachment) -> Bool
notDeleted (p, _) = not $ "deleted" `isSuffixOf` (At.file_path p)
withPathPrefix :: Text -> FilePath
withPathPrefix = ((<>) $ backup_read_root settings) . unpack
parseAttachments :: (JSONPosts.Post, Posts.Post) -> [(At.Paths, At.Attachment)] parseAttachments :: (JSONPosts.Post, Posts.Post) -> [(At.Paths, At.Attachment)]
parseAttachments (p, q) = parseAttachments (p, q) = filter notDeleted $
case JSONPosts.files p of case JSONPosts.files p of
Just files -> map (\x -> (At.Paths (JS.file_path x) (JS.thumb_path x), fileToAttachment q x)) files Just files -> map (\x ->
( At.Paths (withPathPrefix $ JS.file_path x) (withPathPrefix $ JS.thumb_path x)
, fileToAttachment q x)
) files
Nothing -> Nothing ->
case parseLegacyPaths p of case parseLegacyPaths p of
Nothing -> [] Nothing -> []
@ -349,11 +389,10 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
in in
( paths ( paths
, At.Attachment , At.Attachment
{ At.attachment_id = Nothing { At.mimetype = mime
, At.mimetype = mime
, At.creation_time = Posts.creation_time q , At.creation_time = Posts.creation_time q
, At.sha256_hash = undefined , At.sha256_hash = undefined
, At.phash = undefined , At.phash = Nothing
, At.illegal = False , At.illegal = False
, At.post_id = fromJust $ Posts.post_id q , At.post_id = fromJust $ Posts.post_id q
, At.resolution = dim , At.resolution = dim

View File

@ -17,13 +17,16 @@ module DataClient
, postAttachments , postAttachments
) where ) where
import Control.Monad (forM)
import Data.Int (Int64) import Data.Int (Int64)
import Data.Either (lefts, rights)
import Network.HTTP.Simple hiding (httpLbs) import Network.HTTP.Simple hiding (httpLbs)
import Network.HTTP.Client import Network.HTTP.Client
( newManager ( newManager
, managerSetMaxHeaderLength , managerSetMaxHeaderLength
, httpLbs , httpLbs
) )
import qualified Data.ByteString.Lazy.Char8 as BL
import Network.HTTP.Client.Conduit (defaultManagerSettings) import Network.HTTP.Client.Conduit (defaultManagerSettings)
import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString.Lazy.Char8 as LC8 import qualified Data.ByteString.Lazy.Char8 as LC8
@ -185,20 +188,43 @@ getThreads settings board_id board_thread_ids =
ids :: String = intercalate "," $ map show board_thread_ids ids :: String = intercalate "," $ map show board_thread_ids
getAttachments :: T.JSONSettings -> [ Int64 ] -> IO (Either HttpError [ Attachments.Attachment ]) -- | Splits a list into chunks of a given size.
getAttachments settings post_ids = chunkList :: Int -> [a] -> [[a]]
chunkList _ [] = []
chunkList n xs = let (chunk, rest) = splitAt n xs in chunk : chunkList n rest
getAttachments :: T.JSONSettings -> [Int64] -> IO (Either HttpError [Attachments.Attachment])
getAttachments settings post_ids = do
results <- forM (chunkList chunkSize post_ids) (getAttachmentsChunk settings)
return $ combineResults results
where
chunkSize = 1000
-- | Combines the results, prioritizing errors.
combineResults :: [Either e [b]] -> Either e [b]
combineResults results =
case lefts results of
[] -> Right (concat (rights results))
(err:_) -> Left err
-- | Function to handle each chunk.
getAttachmentsChunk :: T.JSONSettings -> [Int64] -> IO (Either HttpError [Attachments.Attachment])
getAttachmentsChunk settings chunk =
get settings path >>= return . eitherDecodeResponse get settings path >>= return . eitherDecodeResponse
where where
path :: String = "/attachments?post_id=in.(" ++ hashes ++ ")" path = "/attachments?post_id=in.(" ++ intercalate "," (map show chunk) ++ ")"
hashes :: String = intercalate "," $ (map show post_ids)
postAttachments postAttachments
:: T.JSONSettings :: T.JSONSettings
-> [ Attachments.Attachment ] -> [ Attachments.Attachment ]
-> IO (Either HttpError [ Attachments.Attachment ]) -> IO (Either HttpError [ Attachments.Attachment ])
postAttachments settings attachments = postAttachments settings attachments = do
BL.putStrLn payload
post settings "/attachments" payload True >>= return . eitherDecodeResponse post settings "/attachments" payload True >>= return . eitherDecodeResponse
where where