WIP: Saving attachments
- handle legacy JSON case for attachment parsing
- compute sha256
- computing phash is TODO
parent a78464fa81
commit 97884ee425
@@ -76,6 +76,7 @@ executable chan-delorean
                       JSONCommonTypes
                       Common.PostsType
                       AttachmentType
+                      Hash

     -- LANGUAGE extensions used by modules in this package.
     -- other-extensions:
@@ -92,7 +93,10 @@ executable chan-delorean
        http-conduit,
        safe-exceptions,
        http-types,
-       time
+       time,
+       cryptonite,
+       memory,
+       mime-types

    -- Directories containing source files.
    hs-source-dirs:   src
@@ -5,7 +5,7 @@ let

   f = { mkDerivation, base, stdenv, cabal-install,
         aeson, safe-exceptions, bytestring, cmdargs,
-        http-conduit
+        http-conduit, cryptonite, memory, mime-types
       }:
       mkDerivation {
         pname = "chan-delorean";
@@ -15,6 +15,7 @@ let
         isExecutable = true;
         executableHaskellDepends = [
           base safe-exceptions aeson bytestring cmdargs http-conduit
+          cryptonite memory mime-types
         ];
         testHaskellDepends = [ cabal-install ];
         license = "unknown";
@@ -10,6 +10,7 @@ WHERE opc.oid >= 16384 AND NOT amvalidate(opc.oid);

 -- </bktree/sql/init.sql>

+DROP TYPE IF EXISTS dimension CASCADE;
 DROP TABLE IF EXISTS sites CASCADE;
 DROP TABLE IF EXISTS boards CASCADE;
 DROP TABLE IF EXISTS threads CASCADE;
@@ -100,6 +101,11 @@ ON posts
 FOR EACH ROW
 EXECUTE FUNCTION update_post_body_search_index();

+CREATE TYPE dimension AS
+    ( width int
+    , height int
+    );
+
 CREATE TABLE IF NOT EXISTS attachments
     ( attachment_id bigserial primary key
     , mimetype text NOT NULL
@@ -108,6 +114,7 @@ CREATE TABLE IF NOT EXISTS attachments
     , phash bigint
     , illegal boolean NOT NULL DEFAULT false
     , post_id bigint NOT NULL
+    , resolution dimension
     , CHECK
       (
         (mimetype NOT IN ('image/jpeg', 'image/png', 'image/gif'))
@@ -0,0 +1,38 @@
+BEGIN TRANSACTION;
+
+DROP TYPE IF EXISTS dimension CASCADE;
+DROP TABLE IF EXISTS attachments CASCADE;
+
+CREATE TYPE dimension AS
+    ( width int
+    , height int
+    );
+
+CREATE TABLE IF NOT EXISTS attachments
+    ( attachment_id bigserial primary key
+    , mimetype text NOT NULL
+    , creation_time timestamp with time zone NOT NULL
+    , sha256_hash text NOT NULL UNIQUE
+    , phash bigint
+    , illegal boolean NOT NULL DEFAULT false
+    , post_id bigint NOT NULL
+    , resolution dimension
+    , CHECK
+      (
+        (mimetype NOT IN ('image/jpeg', 'image/png', 'image/gif'))
+        OR
+        (phash IS NOT NULL)
+      )
+    , CONSTRAINT post_fk FOREIGN KEY (post_id) REFERENCES posts (post_id) ON DELETE CASCADE
+    );
+CREATE INDEX attachments_creation_time_idx ON attachments (creation_time);
+CREATE INDEX attachments_post_id_idx ON attachments (post_id);
+CREATE INDEX attachments_sha256_hash_idx ON attachments (sha256_hash);
+--
+-- Index using the bktree extension for quickly getting the closest phashes
+CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops);
+
+GRANT SELECT ON attachments TO chan_archive_anon;
+GRANT ALL ON attachments TO chan_archiver;
+
+COMMIT;
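For context, these indexes back the two lookup patterns the backfill needs: exact deduplication by digest, and near-duplicate search by perceptual hash. A sketch of both queries follows; the digest and phash literals are placeholders, and the `<->` Hamming-distance operator is an assumption about what the bktree extension exposes over its spgist opclass.

```sql
-- Exact-duplicate check via the unique sha256_hash (placeholder digest):
SELECT attachment_id, post_id
FROM attachments
WHERE sha256_hash = 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855';

-- Near-duplicate lookup, assuming the bktree extension provides a <-> Hamming
-- distance operator usable with attachments_phash_bktree_index (placeholder phash):
SELECT attachment_id, phash <-> 81985529216486895 AS hamming_distance
FROM attachments
WHERE phash IS NOT NULL
ORDER BY phash <-> 81985529216486895
LIMIT 10;
```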
@ -1,6 +1,8 @@
|
||||||
{-# LANGUAGE DeriveAnyClass #-}
|
{-# LANGUAGE DeriveAnyClass #-}
|
||||||
module AttachmentType
|
module AttachmentType
|
||||||
( Attachment (..)
|
( Attachment (..)
|
||||||
|
, Dimension (..)
|
||||||
|
, Paths (..)
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import GHC.Generics
|
import GHC.Generics
|
||||||
|
@@ -9,6 +11,16 @@ import Data.Aeson (FromJSON, ToJSON)
 import Data.Text (Text)
 import Data.Time.Clock (UTCTime)

+data Dimension = Dimension
+    { width :: Int
+    , height :: Int
+    } deriving (Show, Generic, FromJSON, ToJSON)
+
+data Paths = Paths
+    { file_path :: Text
+    , thumbnail_path :: Text
+    }
+
 data Attachment = Attachment
     { attachment_id :: Maybe Int64
     , mimetype :: Text
@@ -17,4 +29,5 @@ data Attachment = Attachment
     , phash :: Int64
     , illegal :: Bool
     , post_id :: Int64
+    , resolution :: Maybe Dimension
     } deriving (Show, Generic, FromJSON, ToJSON)
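Since Dimension picks up FromJSON/ToJSON through Generic and DeriveAnyClass, the new resolution field round-trips as a plain width/height object. A minimal standalone sketch, not project code, with a made-up value:

```haskell
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric  #-}
import Data.Aeson (FromJSON, ToJSON, encode)
import qualified Data.ByteString.Lazy.Char8 as L8
import GHC.Generics (Generic)

-- Same shape as the Dimension added to AttachmentType.
data Dimension = Dimension
    { width  :: Int
    , height :: Int
    } deriving (Show, Generic, FromJSON, ToJSON)

main :: IO ()
main = L8.putStrLn $ encode $ Dimension 640 480
-- prints: {"width":640,"height":480}
```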
src/Backfill.hs (112 changed lines)
@@ -20,6 +20,9 @@ import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
 import Data.Time.Clock (UTCTime)
 import qualified Data.Map as Map
 import Data.Maybe (fromJust)
+import Data.Text (Text, unpack)
+import Data.Text.Encoding (decodeUtf8)
+import Network.Mime (defaultMimeLookup, defaultMimeType)

 import JSONParsing
 import JSONSettings
@@ -29,8 +32,9 @@ import qualified DataClient as Client
 import qualified SitesType as Sites
 import qualified BoardsType as Boards
 import qualified ThreadType as Threads
-import qualified AttachmentType as Attachments
+import qualified AttachmentType as At
 import qualified Common.PostsType as Posts
+import qualified Hash as Hash

 newtype SettingsCLI = SettingsCLI
     { jsonFile :: FilePath
@@ -228,18 +232,29 @@ setPostIdInPosts post_pairs ids = map f ids
         (\(i, j) -> (i, j { Posts.post_id = Just asdf1 })) (post_map Map.! (asdf2, asdf3))


-fileToAttachment :: Posts.Post -> JS.File -> Attachments.Attachment
+fileToAttachment :: Posts.Post -> JS.File -> At.Attachment
 fileToAttachment post file =
-    Attachments.Attachment
-    { Attachments.attachment_id = Nothing
-    , Attachments.mimetype = maybe "undefined/undefined" id (JS.mime file)
-    , Attachments.creation_time = Posts.creation_time post
-    , Attachments.sha256_hash = undefined
-    , Attachments.phash = undefined
-    , Attachments.illegal = False
-    , Attachments.post_id = fromJust $ Posts.post_id post
+    At.Attachment
+    { At.attachment_id = Nothing
+    , At.mimetype = maybe "undefined/undefined" id (JS.mime file)
+    , At.creation_time = Posts.creation_time post
+    , At.sha256_hash = undefined
+    , At.phash = undefined
+    , At.illegal = False
+    , At.post_id = fromJust $ Posts.post_id post
+    , At.resolution = dim
     }
+
+    where
+        dim = (JS.w file) >>= \w ->
+            ((JS.h file) >>= \h ->
+                Just $ At.Dimension w h)
+
+
+getMimeType :: Text -> Text
+getMimeType ext = decodeUtf8 $ defaultMimeLookup ext
+

 processFiles :: JSONSettings -> [(JSONPosts.Post, Posts.Post)] -> IO ()
 processFiles settings post_pairs = do -- perfect just means that our posts have ids, they're already inserted into the db
     existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id . snd) post_pairs)
@@ -249,8 +264,8 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
             putStrLn $ "Error fetching boards: " ++ show err
             exitFailure
         Right existing_attachments -> do
-            let map_existing :: Map.Map Int64 [ Attachments.Attachment ] =
-                    foldl (insertRecord Attachments.post_id) Map.empty existing_attachments
+            let map_existing :: Map.Map Int64 [ At.Attachment ] =
+                    foldl (insertRecord At.post_id) Map.empty existing_attachments

             -- have things like sha256 already
             -- how do we know that a `elem` attachments_on_board and a `elem` existing_attachments
@@ -258,22 +273,79 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
             -- same with attachments_on_board into `map_should_exist :: Map post_id -> Set Attachment`
             --
             -- then run through the keys and compare size of the sets
-            let attachments_on_board :: [(JS.File, Attachments.Attachment)] = concatMap
-                    (\(p, q) -> map (\x -> (x, fileToAttachment q x)) (maybe [] id $ JSONPosts.files p))
-                    post_pairs
-            let map_should_exist :: Map.Map Int64 [(JS.File, Attachments.Attachment)] =
-                    foldl (insertRecord (Attachments.post_id . snd)) Map.empty attachments_on_board
+            let attachments_on_board :: [(At.Paths, At.Attachment)] =
+                    concatMap parseAttachments post_pairs
+
+            let map_should_exist :: Map.Map Int64 [(At.Paths, At.Attachment)] =
+                    foldl (insertRecord (At.post_id . snd)) Map.empty attachments_on_board

             let to_insert_map = Map.filterWithKey (compareAttMaps map_existing) map_should_exist
+
+            let to_insert = foldr (++) [] $ Map.elems to_insert_map
+
+            have_hash <- mapM computeAttachmentHash to_insert
+
+            let existing_hashes :: Set.Set Text =
+                    Set.fromList $ map At.sha256_hash existing_attachments
+
+            let to_insert_ = filter ((`Set.notMember` existing_hashes) . At.sha256_hash) have_hash
+
             -- TODO: Concat all values in to_insert_map and
             -- go ahead and compute sha256 and phashes on them.
+            -- ensure that the sha256 in to_insert_map is not in map_existing

             return ()

     where
+        computeAttachmentHash :: (At.Paths, At.Attachment) -> IO At.Attachment
+        computeAttachmentHash (p, q) = do
+            let f = backup_read_root settings <> "/" <> unpack (At.file_path p)
+            sha256_sum <- Hash.computeSHA256 f
+
+            return q { At.sha256_hash = sha256_sum }
+
+        computeAttachmentPhash :: (At.Paths, At.Attachment) -> IO At.Attachment
+        computeAttachmentPhash = undefined
+
+        parseLegacyPaths :: JSONPosts.Post -> Maybe At.Paths
+        parseLegacyPaths post = do
+            tim <- JSONPosts.tim post
+            ext <- JSONPosts.ext post
+            let board = JSONPosts.board post
+            let file_path = board <> "/src/" <> tim <> ext
+            let thumbnail_path = board <> "/thumb/" <> tim <> ext
+            Just $ At.Paths file_path thumbnail_path
+
+
+        parseAttachments :: (JSONPosts.Post, Posts.Post) -> [(At.Paths, At.Attachment)]
+        parseAttachments (p, q) =
+            case JSONPosts.files p of
+                Just files -> map (\x -> (At.Paths (JS.file_path x) (JS.thumb_path x), fileToAttachment q x)) files
+                Nothing ->
+                    case parseLegacyPaths p of
+                        Nothing -> []
+                        Just paths ->
+                            let
+                                dim = (JSONPosts.w p) >>= \w -> ((JSONPosts.h p) >>= \h -> Just $ At.Dimension w h)
+                                mime = maybe (decodeUtf8 defaultMimeType) getMimeType $ JSONPosts.ext p
+                            in
+                                ( paths
+                                , At.Attachment
+                                    { At.attachment_id = Nothing
+                                    , At.mimetype = mime
+                                    , At.creation_time = Posts.creation_time q
+                                    , At.sha256_hash = undefined
+                                    , At.phash = undefined
+                                    , At.illegal = False
+                                    , At.post_id = fromJust $ Posts.post_id q
+                                    , At.resolution = dim
+                                    }
+                                ) : []
+
         insertRecord
             :: Ord a
             => (b -> a)
@@ -286,9 +358,9 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
             in Map.insert pid (x : l) accMap

         compareAttMaps
-            :: Map.Map Int64 [ Attachments.Attachment ]
+            :: Map.Map Int64 [ At.Attachment ]
             -> Int64
-            -> [(JS.File, Attachments.Attachment)]
+            -> [(At.Paths, At.Attachment)]
             -> Bool
         compareAttMaps existing k v
             = (maybe (-1) length (Map.lookup k existing)) /= length v
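computeAttachmentPhash is left as undefined above and the commit message marks the phash computation as TODO. For illustration only, here is a rough sketch of one way the 64-bit phash column could eventually be filled: an 8x8 average hash over JuicyPixels. JuicyPixels is not among the dependencies this commit adds, and the actual algorithm and library choice are still open, so treat every name below as an assumption rather than the project's implementation.

```haskell
-- Sketch of an average-hash style perceptual hash (assumes JuicyPixels).
import Codec.Picture (Image (..), PixelRGB8 (..), convertRGB8, readImage)
import Codec.Picture.Types (pixelAt)
import Data.Bits (setBit)
import Data.Int (Int64)
import Data.Word (Word64)

averageHash :: FilePath -> IO (Either String Int64)
averageHash path = fmap (fmap (hashImage . convertRGB8)) (readImage path)

hashImage :: Image PixelRGB8 -> Int64
hashImage img = fromIntegral bits
  where
    w = imageWidth img
    h = imageHeight img
    -- mean luma of each cell in an 8x8 grid
    cells = [ cellLuma gx gy | gy <- [0 .. 7], gx <- [0 .. 7] ]
    mean  = sum cells / 64
    -- one bit per cell: set when the cell is brighter than the global mean
    bits  = foldl (\acc (i, v) -> if v > mean then setBit acc i else acc)
                  (0 :: Word64)
                  (zip [0 ..] cells)
    cellLuma gx gy =
        let xs = [ gx * w `div` 8 .. (gx + 1) * w `div` 8 - 1 ]
            ys = [ gy * h `div` 8 .. (gy + 1) * h `div` 8 - 1 ]
            px = [ luma (pixelAt img x y) | x <- xs, y <- ys ]
        in if null px then 0 else sum px / fromIntegral (length px)
    luma (PixelRGB8 r g b) =
        0.299 * fromIntegral r + 0.587 * fromIntegral g + 0.114 * fromIntegral b :: Double
```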
@@ -0,0 +1,16 @@
+module Hash
+( computeSHA256
+) where
+
+import Crypto.Hash (hashlazy, Digest, SHA256)
+import qualified Data.ByteString.Lazy as B
+import qualified Data.ByteArray.Encoding as BA
+import Data.Text (Text)
+import qualified Data.Text.Encoding as T
+
+-- Function to compute SHA256 hash of a file
+computeSHA256 :: FilePath -> IO Text
+computeSHA256 filePath = do
+    fileData <- B.readFile filePath
+    let hashDigest = hashlazy fileData :: Digest SHA256
+    return $ T.decodeUtf8 $ BA.convertToBase BA.Base16 hashDigest
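The digest comes back Base16-encoded, so it should match the lowercase hex that `sha256sum` prints for the same file. A throwaway usage sketch (the file path is made up):

```haskell
import qualified Data.Text.IO as TIO
import qualified Hash

main :: IO ()
main = do
    -- hypothetical path under the archive root; point it at any real file
    digest <- Hash.computeSHA256 "archive/b/src/1628450937547.png"
    TIO.putStrLn digest
```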
@@ -22,10 +22,17 @@ data Post = Post
     , locked :: Maybe Int
     , cyclical :: Maybe J.Cyclical
     , last_modified :: Int
-    , board :: String
+    , board :: Text
     , files :: Maybe [J.File]
     , resto :: Int
     , unique_ips :: Maybe Int
+
+    -- legacy attributes
+    , filename :: Maybe Text
+    , h :: Maybe Int
+    , w :: Maybe Int
+    , ext :: Maybe Text
+    , tim :: Maybe Text
     } deriving (Show, Generic)

 instance FromJSON Post
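These optional fields cover the legacy JSON shape where a post carries tim/ext/w/h directly instead of a files array; parseLegacyPaths above turns them into the same Paths record that new-style files get. A standalone sketch of that path construction, with made-up board/tim/ext values:

```haskell
{-# LANGUAGE OverloadedStrings #-}
import Data.Text (Text)
import qualified Data.Text.IO as TIO

-- Mirrors parseLegacyPaths: <board>/src/<tim><ext> and <board>/thumb/<tim><ext>.
legacyPaths :: Text -> Text -> Text -> (Text, Text)
legacyPaths board tim ext =
    ( board <> "/src/"   <> tim <> ext
    , board <> "/thumb/" <> tim <> ext
    )

main :: IO ()
main = do
    let (file_path, thumbnail_path) = legacyPaths "b" "1628450937547" ".png"
    TIO.putStrLn file_path       -- b/src/1628450937547.png
    TIO.putStrLn thumbnail_path  -- b/thumb/1628450937547.png
```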