Additional checks to find non-existant files
- runs (may have bugs)
This commit is contained in:
parent
29c8f4eedb
commit
7265bc871b
|
@ -360,6 +360,7 @@ GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
|
||||||
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
|
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
|
||||||
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
|
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
|
||||||
GRANT usage, select ON SEQUENCE posts_post_id_seq TO chan_archiver;
|
GRANT usage, select ON SEQUENCE posts_post_id_seq TO chan_archiver;
|
||||||
|
GRANT usage, select ON SEQUENCE attachments_attachment_id_seq TO chan_archiver;
|
||||||
|
|
||||||
GRANT chan_archiver TO admin;
|
GRANT chan_archiver TO admin;
|
||||||
|
|
||||||
|
|
|
@ -17,16 +17,15 @@ data Dimension = Dimension
|
||||||
} deriving (Show, Generic, FromJSON, ToJSON)
|
} deriving (Show, Generic, FromJSON, ToJSON)
|
||||||
|
|
||||||
data Paths = Paths
|
data Paths = Paths
|
||||||
{ file_path :: Text
|
{ file_path :: FilePath
|
||||||
, thumbnail_path :: Text
|
, thumbnail_path :: FilePath
|
||||||
}
|
} deriving (Show)
|
||||||
|
|
||||||
data Attachment = Attachment
|
data Attachment = Attachment
|
||||||
{ attachment_id :: Maybe Int64
|
{ mimetype :: Text
|
||||||
, mimetype :: Text
|
|
||||||
, creation_time :: UTCTime
|
, creation_time :: UTCTime
|
||||||
, sha256_hash :: Text
|
, sha256_hash :: Text
|
||||||
, phash :: Int64
|
, phash :: Maybe Int64
|
||||||
, illegal :: Bool
|
, illegal :: Bool
|
||||||
, post_id :: Int64
|
, post_id :: Int64
|
||||||
, resolution :: Maybe Dimension
|
, resolution :: Maybe Dimension
|
||||||
|
|
|
@ -22,6 +22,7 @@ import qualified Data.Map as Map
|
||||||
import Data.Maybe (fromJust)
|
import Data.Maybe (fromJust)
|
||||||
import Data.Text (Text, unpack)
|
import Data.Text (Text, unpack)
|
||||||
import Data.Text.Encoding (decodeUtf8)
|
import Data.Text.Encoding (decodeUtf8)
|
||||||
|
import Data.List (isSuffixOf)
|
||||||
import Network.Mime (defaultMimeLookup, defaultMimeType)
|
import Network.Mime (defaultMimeLookup, defaultMimeType)
|
||||||
import PerceptualHash (fileHash)
|
import PerceptualHash (fileHash)
|
||||||
|
|
||||||
|
@ -237,11 +238,10 @@ setPostIdInPosts post_pairs ids = map f ids
|
||||||
fileToAttachment :: Posts.Post -> JS.File -> At.Attachment
|
fileToAttachment :: Posts.Post -> JS.File -> At.Attachment
|
||||||
fileToAttachment post file =
|
fileToAttachment post file =
|
||||||
At.Attachment
|
At.Attachment
|
||||||
{ At.attachment_id = Nothing
|
{ At.mimetype = maybe "undefined/undefined" id (JS.mime file)
|
||||||
, At.mimetype = maybe "undefined/undefined" id (JS.mime file)
|
|
||||||
, At.creation_time = Posts.creation_time post
|
, At.creation_time = Posts.creation_time post
|
||||||
, At.sha256_hash = undefined
|
, At.sha256_hash = undefined
|
||||||
, At.phash = undefined
|
, At.phash = Nothing
|
||||||
, At.illegal = False
|
, At.illegal = False
|
||||||
, At.post_id = fromJust $ Posts.post_id post
|
, At.post_id = fromJust $ Posts.post_id post
|
||||||
, At.resolution = dim
|
, At.resolution = dim
|
||||||
|
@ -257,6 +257,14 @@ getMimeType :: Text -> Text
|
||||||
getMimeType ext = decodeUtf8 $ defaultMimeLookup ext
|
getMimeType ext = decodeUtf8 $ defaultMimeLookup ext
|
||||||
|
|
||||||
|
|
||||||
|
phash_mimetypes :: Set.Set Text
|
||||||
|
phash_mimetypes = Set.fromList
|
||||||
|
[ "image/jpeg"
|
||||||
|
, "image/png"
|
||||||
|
, "image/gif"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
processFiles :: JSONSettings -> [(JSONPosts.Post, Posts.Post)] -> IO ()
|
processFiles :: JSONSettings -> [(JSONPosts.Post, Posts.Post)] -> IO ()
|
||||||
processFiles settings post_pairs = do -- perfect just means that our posts have ids, they're already inserted into the db
|
processFiles settings post_pairs = do -- perfect just means that our posts have ids, they're already inserted into the db
|
||||||
existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id . snd) post_pairs)
|
existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id . snd) post_pairs)
|
||||||
|
@ -269,51 +277,73 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
|
||||||
let map_existing :: Map.Map Int64 [ At.Attachment ] =
|
let map_existing :: Map.Map Int64 [ At.Attachment ] =
|
||||||
foldl (insertRecord At.post_id) Map.empty existing_attachments
|
foldl (insertRecord At.post_id) Map.empty existing_attachments
|
||||||
|
|
||||||
-- have things like sha256 already
|
|
||||||
-- how do we know that a `elem` attachments_on_board and a `elem` existing_attachments
|
|
||||||
-- can group existing_attachments into `map_existing :: Map post_id -> Set Attachment`
|
|
||||||
-- same with attachments_on_board into `map_should_exist :: Map post_id -> Set Attachment`
|
|
||||||
--
|
|
||||||
-- then run through the keys and compare size of the sets
|
|
||||||
let attachments_on_board :: [(At.Paths, At.Attachment)] =
|
let attachments_on_board :: [(At.Paths, At.Attachment)] =
|
||||||
concatMap parseAttachments post_pairs
|
concatMap parseAttachments post_pairs
|
||||||
|
|
||||||
let map_should_exist :: Map.Map Int64 [(At.Paths, At.Attachment)] =
|
let map_should_exist :: Map.Map Int64 [(At.Paths, At.Attachment)] =
|
||||||
foldl (insertRecord (At.post_id . snd)) Map.empty attachments_on_board
|
foldl (insertRecord (At.post_id . snd)) Map.empty attachments_on_board
|
||||||
|
|
||||||
|
-- basically we have a map of the post_id to attachments for that post_id
|
||||||
|
-- compare the number of attachments under each post_id in the map
|
||||||
|
-- assembled from the API with the map assembled from data in the db.
|
||||||
|
-- if the number of files differ, then likely there are new attachments.
|
||||||
|
-- This isn't fool-proof (there are probably weird cases where the
|
||||||
|
-- number of files for a post is the same but the files themselves
|
||||||
|
-- are different) but it's a quicker way of seeing which attachments
|
||||||
|
-- need to be saved than finding the sha256 hash of every single existing
|
||||||
|
-- file.
|
||||||
let to_insert_map = Map.filterWithKey (compareAttMaps map_existing) map_should_exist
|
let to_insert_map = Map.filterWithKey (compareAttMaps map_existing) map_should_exist
|
||||||
|
|
||||||
let to_insert = foldr (++) [] $ Map.elems to_insert_map
|
let to_insert = foldr (++) [] $ Map.elems to_insert_map
|
||||||
|
|
||||||
have_hash <- mapM computeAttachmentHash to_insert
|
to_insert_exist <- filterM attachmentFileExists to_insert
|
||||||
|
|
||||||
|
with_hashes <- mapM computeAttachmentHash to_insert_exist
|
||||||
|
|
||||||
let existing_hashes :: Set.Set Text =
|
let existing_hashes :: Set.Set Text =
|
||||||
Set.fromList $ map At.sha256_hash existing_attachments
|
Set.fromList $ map At.sha256_hash existing_attachments
|
||||||
|
|
||||||
let to_insert_ = filter ((`Set.notMember` existing_hashes) . At.sha256_hash) have_hash
|
let to_insert_ = filter ((`Set.notMember` existing_hashes) . At.sha256_hash) with_hashes
|
||||||
|
|
||||||
_ <- Client.postAttachments settings to_insert_
|
attachments_result <- Client.postAttachments settings to_insert_
|
||||||
|
|
||||||
-- TODO: Concat all values in to_insert_map and
|
case attachments_result of
|
||||||
-- go ahead and compute sha256 and phashes on them.
|
Left err -> do
|
||||||
-- ensure that the sha256 in to_insert_map is not in map_existing
|
putStrLn $ "Error posting attachments: " ++ show err
|
||||||
|
exitFailure
|
||||||
|
|
||||||
return ()
|
Right saved -> do
|
||||||
|
putStrLn $ "Saved " ++ (show $ length saved) ++ " attachments!"
|
||||||
|
|
||||||
where
|
where
|
||||||
|
attachmentFileExists :: (At.Paths, At.Attachment) -> IO Bool
|
||||||
|
attachmentFileExists (p, _) = doesFileExist (At.file_path p)
|
||||||
|
|
||||||
computeAttachmentHash :: (At.Paths, At.Attachment) -> IO At.Attachment
|
computeAttachmentHash :: (At.Paths, At.Attachment) -> IO At.Attachment
|
||||||
computeAttachmentHash (p, q) = do
|
computeAttachmentHash (p, q) = do
|
||||||
let f = backup_read_root settings <> "/" <> unpack (At.file_path p)
|
let f = At.file_path p
|
||||||
|
|
||||||
|
putStrLn $ "Reading " ++ f
|
||||||
|
-- putStrLn $ show p
|
||||||
|
-- putStrLn $ show (q { At.sha256_hash = "undefined" })
|
||||||
|
|
||||||
sha256_sum <- Hash.computeSHA256 f
|
sha256_sum <- Hash.computeSHA256 f
|
||||||
|
|
||||||
either_phash <- fileHash f
|
putStrLn $ "SHA-256: " ++ unpack sha256_sum
|
||||||
|
|
||||||
phash :: Int64 <- case either_phash of
|
phash :: Maybe Int64 <-
|
||||||
Left err_str -> do
|
case (At.mimetype q) `Set.member` phash_mimetypes of
|
||||||
putStrLn $ "Failed to compute phash for file " ++ (unpack sha256_sum) ++ " " ++ f ++ " " ++ err_str
|
True -> do
|
||||||
return (-1)
|
either_phash <- fileHash f
|
||||||
Right phash_w -> return $ Words.wordToSignedInt64 phash_w
|
case either_phash of
|
||||||
|
Left err_str -> do
|
||||||
|
putStrLn $ "Failed to compute phash for file " ++ (unpack sha256_sum) ++ " " ++ f ++ " " ++ err_str
|
||||||
|
return Nothing
|
||||||
|
Right phash_w -> do
|
||||||
|
putStrLn $ "phash: " ++ show phash_w
|
||||||
|
return $ Just $ Words.wordToSignedInt64 phash_w
|
||||||
|
|
||||||
|
False -> return Nothing
|
||||||
|
|
||||||
|
|
||||||
return q
|
return q
|
||||||
|
@ -329,16 +359,26 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
|
||||||
|
|
||||||
let board = JSONPosts.board post
|
let board = JSONPosts.board post
|
||||||
|
|
||||||
let file_path = board <> "/src/" <> tim <> ext
|
let file_path = withPathPrefix $ board <> "/src/" <> tim <> ext
|
||||||
let thumbnail_path = board <> "/thumb/" <> tim <> ext
|
let thumbnail_path = withPathPrefix $ board <> "/thumb/" <> tim <> ext
|
||||||
|
|
||||||
Just $ At.Paths file_path thumbnail_path
|
Just $ At.Paths file_path thumbnail_path
|
||||||
|
|
||||||
|
|
||||||
|
notDeleted :: (At.Paths, At.Attachment) -> Bool
|
||||||
|
notDeleted (p, _) = not $ "deleted" `isSuffixOf` (At.file_path p)
|
||||||
|
|
||||||
|
|
||||||
|
withPathPrefix :: Text -> FilePath
|
||||||
|
withPathPrefix = ((<>) $ backup_read_root settings) . unpack
|
||||||
|
|
||||||
parseAttachments :: (JSONPosts.Post, Posts.Post) -> [(At.Paths, At.Attachment)]
|
parseAttachments :: (JSONPosts.Post, Posts.Post) -> [(At.Paths, At.Attachment)]
|
||||||
parseAttachments (p, q) =
|
parseAttachments (p, q) = filter notDeleted $
|
||||||
case JSONPosts.files p of
|
case JSONPosts.files p of
|
||||||
Just files -> map (\x -> (At.Paths (JS.file_path x) (JS.thumb_path x), fileToAttachment q x)) files
|
Just files -> map (\x ->
|
||||||
|
( At.Paths (withPathPrefix $ JS.file_path x) (withPathPrefix $ JS.thumb_path x)
|
||||||
|
, fileToAttachment q x)
|
||||||
|
) files
|
||||||
Nothing ->
|
Nothing ->
|
||||||
case parseLegacyPaths p of
|
case parseLegacyPaths p of
|
||||||
Nothing -> []
|
Nothing -> []
|
||||||
|
@ -349,11 +389,10 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
|
||||||
in
|
in
|
||||||
( paths
|
( paths
|
||||||
, At.Attachment
|
, At.Attachment
|
||||||
{ At.attachment_id = Nothing
|
{ At.mimetype = mime
|
||||||
, At.mimetype = mime
|
|
||||||
, At.creation_time = Posts.creation_time q
|
, At.creation_time = Posts.creation_time q
|
||||||
, At.sha256_hash = undefined
|
, At.sha256_hash = undefined
|
||||||
, At.phash = undefined
|
, At.phash = Nothing
|
||||||
, At.illegal = False
|
, At.illegal = False
|
||||||
, At.post_id = fromJust $ Posts.post_id q
|
, At.post_id = fromJust $ Posts.post_id q
|
||||||
, At.resolution = dim
|
, At.resolution = dim
|
||||||
|
|
|
@ -17,13 +17,16 @@ module DataClient
|
||||||
, postAttachments
|
, postAttachments
|
||||||
) where
|
) where
|
||||||
|
|
||||||
|
import Control.Monad (forM)
|
||||||
import Data.Int (Int64)
|
import Data.Int (Int64)
|
||||||
|
import Data.Either (lefts, rights)
|
||||||
import Network.HTTP.Simple hiding (httpLbs)
|
import Network.HTTP.Simple hiding (httpLbs)
|
||||||
import Network.HTTP.Client
|
import Network.HTTP.Client
|
||||||
( newManager
|
( newManager
|
||||||
, managerSetMaxHeaderLength
|
, managerSetMaxHeaderLength
|
||||||
, httpLbs
|
, httpLbs
|
||||||
)
|
)
|
||||||
|
import qualified Data.ByteString.Lazy.Char8 as BL
|
||||||
import Network.HTTP.Client.Conduit (defaultManagerSettings)
|
import Network.HTTP.Client.Conduit (defaultManagerSettings)
|
||||||
import qualified Data.ByteString.Lazy as LBS
|
import qualified Data.ByteString.Lazy as LBS
|
||||||
import qualified Data.ByteString.Lazy.Char8 as LC8
|
import qualified Data.ByteString.Lazy.Char8 as LC8
|
||||||
|
@ -185,20 +188,43 @@ getThreads settings board_id board_thread_ids =
|
||||||
ids :: String = intercalate "," $ map show board_thread_ids
|
ids :: String = intercalate "," $ map show board_thread_ids
|
||||||
|
|
||||||
|
|
||||||
getAttachments :: T.JSONSettings -> [ Int64 ] -> IO (Either HttpError [ Attachments.Attachment ])
|
-- | Splits a list into chunks of a given size.
|
||||||
getAttachments settings post_ids =
|
chunkList :: Int -> [a] -> [[a]]
|
||||||
|
chunkList _ [] = []
|
||||||
|
chunkList n xs = let (chunk, rest) = splitAt n xs in chunk : chunkList n rest
|
||||||
|
|
||||||
|
|
||||||
|
getAttachments :: T.JSONSettings -> [Int64] -> IO (Either HttpError [Attachments.Attachment])
|
||||||
|
getAttachments settings post_ids = do
|
||||||
|
results <- forM (chunkList chunkSize post_ids) (getAttachmentsChunk settings)
|
||||||
|
return $ combineResults results
|
||||||
|
where
|
||||||
|
chunkSize = 1000
|
||||||
|
|
||||||
|
|
||||||
|
-- | Combines the results, prioritizing errors.
|
||||||
|
combineResults :: [Either e [b]] -> Either e [b]
|
||||||
|
combineResults results =
|
||||||
|
case lefts results of
|
||||||
|
[] -> Right (concat (rights results))
|
||||||
|
(err:_) -> Left err
|
||||||
|
|
||||||
|
|
||||||
|
-- | Function to handle each chunk.
|
||||||
|
getAttachmentsChunk :: T.JSONSettings -> [Int64] -> IO (Either HttpError [Attachments.Attachment])
|
||||||
|
getAttachmentsChunk settings chunk =
|
||||||
get settings path >>= return . eitherDecodeResponse
|
get settings path >>= return . eitherDecodeResponse
|
||||||
|
|
||||||
where
|
where
|
||||||
path :: String = "/attachments?post_id=in.(" ++ hashes ++ ")"
|
path = "/attachments?post_id=in.(" ++ intercalate "," (map show chunk) ++ ")"
|
||||||
hashes :: String = intercalate "," $ (map show post_ids)
|
|
||||||
|
|
||||||
|
|
||||||
postAttachments
|
postAttachments
|
||||||
:: T.JSONSettings
|
:: T.JSONSettings
|
||||||
-> [ Attachments.Attachment ]
|
-> [ Attachments.Attachment ]
|
||||||
-> IO (Either HttpError [ Attachments.Attachment ])
|
-> IO (Either HttpError [ Attachments.Attachment ])
|
||||||
postAttachments settings attachments =
|
postAttachments settings attachments = do
|
||||||
|
BL.putStrLn payload
|
||||||
post settings "/attachments" payload True >>= return . eitherDecodeResponse
|
post settings "/attachments" payload True >>= return . eitherDecodeResponse
|
||||||
|
|
||||||
where
|
where
|
||||||
|
|
Loading…
Reference in New Issue