diff --git a/chan-delorean.cabal b/chan-delorean.cabal index ed11fe9..ae32bb7 100644 --- a/chan-delorean.cabal +++ b/chan-delorean.cabal @@ -76,6 +76,7 @@ executable chan-delorean JSONCommonTypes Common.PostsType AttachmentType + Hash -- LANGUAGE extensions used by modules in this package. -- other-extensions: @@ -92,7 +93,10 @@ executable chan-delorean http-conduit, safe-exceptions, http-types, - time + time, + cryptonite, + memory, + mime-types -- Directories containing source files. hs-source-dirs: src diff --git a/shell.nix b/shell.nix index 998b42c..29292c1 100644 --- a/shell.nix +++ b/shell.nix @@ -5,7 +5,7 @@ let f = { mkDerivation, base, stdenv, cabal-install, aeson, safe-exceptions, bytestring, cmdargs, - http-conduit + http-conduit, cryptonite, memory, mime-types }: mkDerivation { pname = "chan-delorean"; @@ -15,6 +15,7 @@ let isExecutable = true; executableHaskellDepends = [ base safe-exceptions aeson bytestring cmdargs http-conduit + cryptonite memory mime-types ]; testHaskellDepends = [ cabal-install ]; license = "unknown"; diff --git a/sql/initialize.sql b/sql/initialize.sql index 3f5fe46..4ce7529 100644 --- a/sql/initialize.sql +++ b/sql/initialize.sql @@ -10,6 +10,7 @@ WHERE opc.oid >= 16384 AND NOT amvalidate(opc.oid); -- +DROP TYPE IF EXISTS dimension CASCADE; DROP TABLE IF EXISTS sites CASCADE; DROP TABLE IF EXISTS boards CASCADE; DROP TABLE IF EXISTS threads CASCADE; @@ -100,6 +101,11 @@ ON posts FOR EACH ROW EXECUTE FUNCTION update_post_body_search_index(); +CREATE TYPE dimension AS + ( width int + , height int + ); + CREATE TABLE IF NOT EXISTS attachments ( attachment_id bigserial primary key , mimetype text NOT NULL @@ -108,6 +114,7 @@ CREATE TABLE IF NOT EXISTS attachments , phash bigint , illegal boolean NOT NULL DEFAULT false , post_id bigint NOT NULL + , resolution dimension , CHECK ( (mimetype NOT IN ('image/jpeg', 'image/png', 'image/gif')) diff --git a/sql/recreate_attachments_table.sql b/sql/recreate_attachments_table.sql new file mode 100644 index 0000000..9d1db4e --- /dev/null +++ b/sql/recreate_attachments_table.sql @@ -0,0 +1,38 @@ +BEGIN TRANSACTION; + +DROP TYPE IF EXISTS dimension CASCADE; +DROP TABLE IF EXISTS attachments CASCADE; + +CREATE TYPE dimension AS + ( width int + , height int + ); + +CREATE TABLE IF NOT EXISTS attachments + ( attachment_id bigserial primary key + , mimetype text NOT NULL + , creation_time timestamp with time zone NOT NULL + , sha256_hash text NOT NULL UNIQUE + , phash bigint + , illegal boolean NOT NULL DEFAULT false + , post_id bigint NOT NULL + , resolution dimension + , CHECK + ( + (mimetype NOT IN ('image/jpeg', 'image/png', 'image/gif')) + OR + (phash IS NOT NULL) + ) + , CONSTRAINT post_fk FOREIGN KEY (post_id) REFERENCES posts (post_id) ON DELETE CASCADE + ); +CREATE INDEX attachments_creation_time_idx ON attachments (creation_time); +CREATE INDEX attachments_post_id_idx ON attachments (post_id); +CREATE INDEX attachments_sha256_hash_idx ON attachments (sha256_hash); +-- +-- Index using the bktree extension for quickly getting the closest phashes +CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops); + +GRANT SELECT ON attachments TO chan_archive_anon; +GRANT ALL ON attachments TO chan_archiver; + +COMMIT; diff --git a/src/AttachmentType.hs b/src/AttachmentType.hs index a20e1d2..48683ea 100644 --- a/src/AttachmentType.hs +++ b/src/AttachmentType.hs @@ -1,6 +1,8 @@ {-# LANGUAGE DeriveAnyClass #-} module AttachmentType ( Attachment (..) +, Dimension (..) +, Paths (..) ) where import GHC.Generics @@ -9,6 +11,16 @@ import Data.Aeson (FromJSON, ToJSON) import Data.Text (Text) import Data.Time.Clock (UTCTime) +data Dimension = Dimension + { width :: Int + , height :: Int + } deriving (Show, Generic, FromJSON, ToJSON) + +data Paths = Paths + { file_path :: Text + , thumbnail_path :: Text + } + data Attachment = Attachment { attachment_id :: Maybe Int64 , mimetype :: Text @@ -17,4 +29,5 @@ data Attachment = Attachment , phash :: Int64 , illegal :: Bool , post_id :: Int64 + , resolution :: Maybe Dimension } deriving (Show, Generic, FromJSON, ToJSON) diff --git a/src/Backfill.hs b/src/Backfill.hs index 039dd65..3c59c12 100644 --- a/src/Backfill.hs +++ b/src/Backfill.hs @@ -20,6 +20,9 @@ import Data.Time.Clock.POSIX (posixSecondsToUTCTime) import Data.Time.Clock (UTCTime) import qualified Data.Map as Map import Data.Maybe (fromJust) +import Data.Text (Text, unpack) +import Data.Text.Encoding (decodeUtf8) +import Network.Mime (defaultMimeLookup, defaultMimeType) import JSONParsing import JSONSettings @@ -29,8 +32,9 @@ import qualified DataClient as Client import qualified SitesType as Sites import qualified BoardsType as Boards import qualified ThreadType as Threads -import qualified AttachmentType as Attachments +import qualified AttachmentType as At import qualified Common.PostsType as Posts +import qualified Hash as Hash newtype SettingsCLI = SettingsCLI { jsonFile :: FilePath @@ -228,18 +232,29 @@ setPostIdInPosts post_pairs ids = map f ids (\(i, j) -> (i, j { Posts.post_id = Just asdf1 })) (post_map Map.! (asdf2, asdf3)) -fileToAttachment :: Posts.Post -> JS.File -> Attachments.Attachment +fileToAttachment :: Posts.Post -> JS.File -> At.Attachment fileToAttachment post file = - Attachments.Attachment - { Attachments.attachment_id = Nothing - , Attachments.mimetype = maybe "undefined/undefined" id (JS.mime file) - , Attachments.creation_time = Posts.creation_time post - , Attachments.sha256_hash = undefined - , Attachments.phash = undefined - , Attachments.illegal = False - , Attachments.post_id = fromJust $ Posts.post_id post + At.Attachment + { At.attachment_id = Nothing + , At.mimetype = maybe "undefined/undefined" id (JS.mime file) + , At.creation_time = Posts.creation_time post + , At.sha256_hash = undefined + , At.phash = undefined + , At.illegal = False + , At.post_id = fromJust $ Posts.post_id post + , At.resolution = dim } + where + dim = (JS.w file) >>= \w -> + ((JS.h file) >>= \h -> + Just $ At.Dimension w h) + + +getMimeType :: Text -> Text +getMimeType ext = decodeUtf8 $ defaultMimeLookup ext + + processFiles :: JSONSettings -> [(JSONPosts.Post, Posts.Post)] -> IO () processFiles settings post_pairs = do -- perfect just means that our posts have ids, they're already inserted into the db existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id . snd) post_pairs) @@ -249,8 +264,8 @@ processFiles settings post_pairs = do -- perfect just means that our posts have putStrLn $ "Error fetching boards: " ++ show err exitFailure Right existing_attachments -> do - let map_existing :: Map.Map Int64 [ Attachments.Attachment ] = - foldl (insertRecord Attachments.post_id) Map.empty existing_attachments + let map_existing :: Map.Map Int64 [ At.Attachment ] = + foldl (insertRecord At.post_id) Map.empty existing_attachments -- have things like sha256 already -- how do we know that a `elem` attachments_on_board and a `elem` existing_attachments @@ -258,22 +273,79 @@ processFiles settings post_pairs = do -- perfect just means that our posts have -- same with attachments_on_board into `map_should_exist :: Map post_id -> Set Attachment` -- -- then run through the keys and compare size of the sets + let attachments_on_board :: [(At.Paths, At.Attachment)] = + concatMap parseAttachments post_pairs - let attachments_on_board :: [(JS.File, Attachments.Attachment)] = concatMap - (\(p, q) -> map (\x -> (x, fileToAttachment q x)) (maybe [] id $ JSONPosts.files p)) - post_pairs - - let map_should_exist :: Map.Map Int64 [(JS.File, Attachments.Attachment)] = - foldl (insertRecord (Attachments.post_id . snd)) Map.empty attachments_on_board + let map_should_exist :: Map.Map Int64 [(At.Paths, At.Attachment)] = + foldl (insertRecord (At.post_id . snd)) Map.empty attachments_on_board let to_insert_map = Map.filterWithKey (compareAttMaps map_existing) map_should_exist + let to_insert = foldr (++) [] $ Map.elems to_insert_map + + have_hash <- mapM computeAttachmentHash to_insert + + let existing_hashes :: Set.Set Text = + Set.fromList $ map At.sha256_hash existing_attachments + + let to_insert_ = filter ((`Set.notMember` existing_hashes) . At.sha256_hash) have_hash + -- TODO: Concat all values in to_insert_map and -- go ahead and compute sha256 and phashes on them. + -- ensure that the sha256 in to_insert_map is not in map_existing return () where + computeAttachmentHash :: (At.Paths, At.Attachment) -> IO At.Attachment + computeAttachmentHash (p, q) = do + let f = backup_read_root settings <> "/" <> unpack (At.file_path p) + sha256_sum <- Hash.computeSHA256 f + + return q { At.sha256_hash = sha256_sum } + + computeAttachmentPhash :: (At.Paths, At.Attachment) -> IO At.Attachment + computeAttachmentPhash = undefined + + parseLegacyPaths :: JSONPosts.Post -> Maybe At.Paths + parseLegacyPaths post = do + tim <- JSONPosts.tim post + + ext <- JSONPosts.ext post + + let board = JSONPosts.board post + + let file_path = board <> "/src/" <> tim <> ext + let thumbnail_path = board <> "/thumb/" <> tim <> ext + + Just $ At.Paths file_path thumbnail_path + + + parseAttachments :: (JSONPosts.Post, Posts.Post) -> [(At.Paths, At.Attachment)] + parseAttachments (p, q) = + case JSONPosts.files p of + Just files -> map (\x -> (At.Paths (JS.file_path x) (JS.thumb_path x), fileToAttachment q x)) files + Nothing -> + case parseLegacyPaths p of + Nothing -> [] + Just paths -> + let + dim = (JSONPosts.w p) >>= \w -> ((JSONPosts.h p) >>= \h -> Just $ At.Dimension w h) + mime = maybe (decodeUtf8 defaultMimeType) getMimeType $ JSONPosts.ext p + in + ( paths + , At.Attachment + { At.attachment_id = Nothing + , At.mimetype = mime + , At.creation_time = Posts.creation_time q + , At.sha256_hash = undefined + , At.phash = undefined + , At.illegal = False + , At.post_id = fromJust $ Posts.post_id q + , At.resolution = dim + } + ) : [] + insertRecord :: Ord a => (b -> a) @@ -286,9 +358,9 @@ processFiles settings post_pairs = do -- perfect just means that our posts have in Map.insert pid (x : l) accMap compareAttMaps - :: Map.Map Int64 [ Attachments.Attachment ] + :: Map.Map Int64 [ At.Attachment ] -> Int64 - -> [(JS.File, Attachments.Attachment)] + -> [(At.Paths, At.Attachment)] -> Bool compareAttMaps existing k v = (maybe (-1) length (Map.lookup k existing)) /= length v diff --git a/src/Hash.hs b/src/Hash.hs new file mode 100644 index 0000000..e37afae --- /dev/null +++ b/src/Hash.hs @@ -0,0 +1,16 @@ +module Hash +( computeSHA256 +) where + +import Crypto.Hash (hashlazy, Digest, SHA256) +import qualified Data.ByteString.Lazy as B +import qualified Data.ByteArray.Encoding as BA +import Data.Text (Text) +import qualified Data.Text.Encoding as T + +-- Function to compute SHA256 hash of a file +computeSHA256 :: FilePath -> IO Text +computeSHA256 filePath = do + fileData <- B.readFile filePath + let hashDigest = hashlazy fileData :: Digest SHA256 + return $ T.decodeUtf8 $ BA.convertToBase BA.Base16 hashDigest diff --git a/src/JSONPost.hs b/src/JSONPost.hs index a55f342..58fcfa8 100644 --- a/src/JSONPost.hs +++ b/src/JSONPost.hs @@ -22,10 +22,17 @@ data Post = Post , locked :: Maybe Int , cyclical :: Maybe J.Cyclical , last_modified :: Int - , board :: String + , board :: Text , files :: Maybe [J.File] , resto :: Int , unique_ips :: Maybe Int + + -- legacy attributes + , filename :: Maybe Text + , h :: Maybe Int + , w :: Maybe Int + , ext :: Maybe Text + , tim :: Maybe Text } deriving (Show, Generic) instance FromJSON Post