WIP: Saving attachments

- handle legacy JSON case for attachment parsing
- compute sha256
- computing phash is TODO
towards-a-new-leftypol 2024-01-19 19:05:02 -05:00
parent a78464fa81
commit 97884ee425
8 changed files with 181 additions and 23 deletions

View File

@@ -76,6 +76,7 @@ executable chan-delorean
  JSONCommonTypes
  Common.PostsType
  AttachmentType
+ Hash
  -- LANGUAGE extensions used by modules in this package.
  -- other-extensions:
@@ -92,7 +93,10 @@ executable chan-delorean
  http-conduit,
  safe-exceptions,
  http-types,
- time
+ time,
+ cryptonite,
+ memory,
+ mime-types
  -- Directories containing source files.
  hs-source-dirs: src

View File

@@ -5,7 +5,7 @@ let
  f = { mkDerivation, base, stdenv, cabal-install,
  aeson, safe-exceptions, bytestring, cmdargs,
- http-conduit
+ http-conduit, cryptonite, memory, mime-types
  }:
  mkDerivation {
  pname = "chan-delorean";
@@ -15,6 +15,7 @@ let
  isExecutable = true;
  executableHaskellDepends = [
  base safe-exceptions aeson bytestring cmdargs http-conduit
+ cryptonite memory mime-types
  ];
  testHaskellDepends = [ cabal-install ];
  license = "unknown";

View File

@@ -10,6 +10,7 @@ WHERE opc.oid >= 16384 AND NOT amvalidate(opc.oid);
  -- </bktree/sql/init.sql>
+ DROP TYPE IF EXISTS dimension CASCADE;
  DROP TABLE IF EXISTS sites CASCADE;
  DROP TABLE IF EXISTS boards CASCADE;
  DROP TABLE IF EXISTS threads CASCADE;
@@ -100,6 +101,11 @@ ON posts
  FOR EACH ROW
  EXECUTE FUNCTION update_post_body_search_index();
+ CREATE TYPE dimension AS
+ ( width int
+ , height int
+ );
  CREATE TABLE IF NOT EXISTS attachments
  ( attachment_id bigserial primary key
  , mimetype text NOT NULL
@@ -108,6 +114,7 @@ CREATE TABLE IF NOT EXISTS attachments
  , phash bigint
  , illegal boolean NOT NULL DEFAULT false
  , post_id bigint NOT NULL
+ , resolution dimension
  , CHECK
  (
  (mimetype NOT IN ('image/jpeg', 'image/png', 'image/gif'))

View File

@@ -0,0 +1,38 @@
BEGIN TRANSACTION;
DROP TYPE IF EXISTS dimension CASCADE;
DROP TABLE IF EXISTS attachments CASCADE;
CREATE TYPE dimension AS
( width int
, height int
);
CREATE TABLE IF NOT EXISTS attachments
( attachment_id bigserial primary key
, mimetype text NOT NULL
, creation_time timestamp with time zone NOT NULL
, sha256_hash text NOT NULL UNIQUE
, phash bigint
, illegal boolean NOT NULL DEFAULT false
, post_id bigint NOT NULL
, resolution dimension
, CHECK
(
(mimetype NOT IN ('image/jpeg', 'image/png', 'image/gif'))
OR
(phash IS NOT NULL)
)
, CONSTRAINT post_fk FOREIGN KEY (post_id) REFERENCES posts (post_id) ON DELETE CASCADE
);
CREATE INDEX attachments_creation_time_idx ON attachments (creation_time);
CREATE INDEX attachments_post_id_idx ON attachments (post_id);
CREATE INDEX attachments_sha256_hash_idx ON attachments (sha256_hash);
--
-- Index using the bktree extension for quickly getting the closest phashes
CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops);
GRANT SELECT ON attachments TO chan_archive_anon;
GRANT ALL ON attachments TO chan_archiver;
COMMIT;
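
The CHECK constraint encodes the rule that a jpeg/png/gif attachment must carry a perceptual hash, while any other mimetype may leave phash NULL. As a minimal sketch, the same rule as a standalone Haskell predicate (a hypothetical helper, not part of this commit; the table enforces it server-side anyway):

{-# LANGUAGE OverloadedStrings #-}
import Data.Int (Int64)
import Data.Maybe (isJust)
import Data.Text (Text)

-- True when a row would pass the attachments CHECK constraint:
-- the common image types need a phash, everything else may omit it.
satisfiesPhashCheck :: Text -> Maybe Int64 -> Bool
satisfiesPhashCheck mimetype phash =
    mimetype `notElem` ["image/jpeg", "image/png", "image/gif"] || isJust phash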

View File

@@ -1,6 +1,8 @@
  {-# LANGUAGE DeriveAnyClass #-}
  module AttachmentType
  ( Attachment (..)
+ , Dimension (..)
+ , Paths (..)
  ) where
  import GHC.Generics
@@ -9,6 +11,16 @@ import Data.Aeson (FromJSON, ToJSON)
  import Data.Text (Text)
  import Data.Time.Clock (UTCTime)
+ data Dimension = Dimension
+ { width :: Int
+ , height :: Int
+ } deriving (Show, Generic, FromJSON, ToJSON)
+ data Paths = Paths
+ { file_path :: Text
+ , thumbnail_path :: Text
+ }
  data Attachment = Attachment
  { attachment_id :: Maybe Int64
  , mimetype :: Text
@@ -17,4 +29,5 @@ data Attachment = Attachment
  , phash :: Int64
  , illegal :: Bool
  , post_id :: Int64
+ , resolution :: Maybe Dimension
  } deriving (Show, Generic, FromJSON, ToJSON)
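
Dimension picks up FromJSON/ToJSON through the generic deriving above, so a resolution round-trips as a plain JSON object keyed by field name. A small sketch with hypothetical values, assuming aeson's default generic encoding:

import Data.Aeson (decode, encode)

-- encode (Dimension 1920 1080)
--   ==> {"width":1920,"height":1080}
-- decode "{\"width\":1920,\"height\":1080}" :: Maybe Dimension
--   ==> Just (Dimension {width = 1920, height = 1080})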

View File

@@ -20,6 +20,9 @@ import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
  import Data.Time.Clock (UTCTime)
  import qualified Data.Map as Map
  import Data.Maybe (fromJust)
+ import Data.Text (Text, unpack)
+ import Data.Text.Encoding (decodeUtf8)
+ import Network.Mime (defaultMimeLookup, defaultMimeType)
  import JSONParsing
  import JSONSettings
@@ -29,8 +32,9 @@ import qualified DataClient as Client
  import qualified SitesType as Sites
  import qualified BoardsType as Boards
  import qualified ThreadType as Threads
- import qualified AttachmentType as Attachments
+ import qualified AttachmentType as At
  import qualified Common.PostsType as Posts
+ import qualified Hash as Hash
  newtype SettingsCLI = SettingsCLI
  { jsonFile :: FilePath
@@ -228,18 +232,29 @@ setPostIdInPosts post_pairs ids = map f ids
  (\(i, j) -> (i, j { Posts.post_id = Just asdf1 })) (post_map Map.! (asdf2, asdf3))
- fileToAttachment :: Posts.Post -> JS.File -> Attachments.Attachment
- fileToAttachment post file =
- Attachments.Attachment
- { Attachments.attachment_id = Nothing
- , Attachments.mimetype = maybe "undefined/undefined" id (JS.mime file)
- , Attachments.creation_time = Posts.creation_time post
- , Attachments.sha256_hash = undefined
- , Attachments.phash = undefined
- , Attachments.illegal = False
- , Attachments.post_id = fromJust $ Posts.post_id post
- }
+ fileToAttachment :: Posts.Post -> JS.File -> At.Attachment
+ fileToAttachment post file =
+ At.Attachment
+ { At.attachment_id = Nothing
+ , At.mimetype = maybe "undefined/undefined" id (JS.mime file)
+ , At.creation_time = Posts.creation_time post
+ , At.sha256_hash = undefined
+ , At.phash = undefined
+ , At.illegal = False
+ , At.post_id = fromJust $ Posts.post_id post
+ , At.resolution = dim
+ }
+ where
+ dim = (JS.w file) >>= \w ->
+ ((JS.h file) >>= \h ->
+ Just $ At.Dimension w h)
+ getMimeType :: Text -> Text
+ getMimeType ext = decodeUtf8 $ defaultMimeLookup ext
  processFiles :: JSONSettings -> [(JSONPosts.Post, Posts.Post)] -> IO ()
  processFiles settings post_pairs = do -- perfect just means that our posts have ids, they're already inserted into the db
  existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id . snd) post_pairs)
@@ -249,8 +264,8 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
  putStrLn $ "Error fetching boards: " ++ show err
  exitFailure
  Right existing_attachments -> do
- let map_existing :: Map.Map Int64 [ Attachments.Attachment ] =
- foldl (insertRecord Attachments.post_id) Map.empty existing_attachments
+ let map_existing :: Map.Map Int64 [ At.Attachment ] =
+ foldl (insertRecord At.post_id) Map.empty existing_attachments
  -- have things like sha256 already
  -- how do we know that a `elem` attachments_on_board and a `elem` existing_attachments
@@ -258,22 +273,79 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
  -- same with attachments_on_board into `map_should_exist :: Map post_id -> Set Attachment`
  --
  -- then run through the keys and compare size of the sets
- let attachments_on_board :: [(JS.File, Attachments.Attachment)] = concatMap
- (\(p, q) -> map (\x -> (x, fileToAttachment q x)) (maybe [] id $ JSONPosts.files p))
- post_pairs
- let map_should_exist :: Map.Map Int64 [(JS.File, Attachments.Attachment)] =
- foldl (insertRecord (Attachments.post_id . snd)) Map.empty attachments_on_board
+ let attachments_on_board :: [(At.Paths, At.Attachment)] =
+ concatMap parseAttachments post_pairs
+ let map_should_exist :: Map.Map Int64 [(At.Paths, At.Attachment)] =
+ foldl (insertRecord (At.post_id . snd)) Map.empty attachments_on_board
  let to_insert_map = Map.filterWithKey (compareAttMaps map_existing) map_should_exist
+ let to_insert = foldr (++) [] $ Map.elems to_insert_map
+ have_hash <- mapM computeAttachmentHash to_insert
+ let existing_hashes :: Set.Set Text =
+ Set.fromList $ map At.sha256_hash existing_attachments
+ let to_insert_ = filter ((`Set.notMember` existing_hashes) . At.sha256_hash) have_hash
  -- TODO: Concat all values in to_insert_map and
  -- go ahead and compute sha256 and phashes on them.
  -- ensure that the sha256 in to_insert_map is not in map_existing
  return ()
  where
computeAttachmentHash :: (At.Paths, At.Attachment) -> IO At.Attachment
computeAttachmentHash (p, q) = do
let f = backup_read_root settings <> "/" <> unpack (At.file_path p)
sha256_sum <- Hash.computeSHA256 f
return q { At.sha256_hash = sha256_sum }
computeAttachmentPhash :: (At.Paths, At.Attachment) -> IO At.Attachment
computeAttachmentPhash = undefined
parseLegacyPaths :: JSONPosts.Post -> Maybe At.Paths
parseLegacyPaths post = do
tim <- JSONPosts.tim post
ext <- JSONPosts.ext post
let board = JSONPosts.board post
let file_path = board <> "/src/" <> tim <> ext
let thumbnail_path = board <> "/thumb/" <> tim <> ext
Just $ At.Paths file_path thumbnail_path
parseAttachments :: (JSONPosts.Post, Posts.Post) -> [(At.Paths, At.Attachment)]
parseAttachments (p, q) =
case JSONPosts.files p of
Just files -> map (\x -> (At.Paths (JS.file_path x) (JS.thumb_path x), fileToAttachment q x)) files
Nothing ->
case parseLegacyPaths p of
Nothing -> []
Just paths ->
let
dim = (JSONPosts.w p) >>= \w -> ((JSONPosts.h p) >>= \h -> Just $ At.Dimension w h)
mime = maybe (decodeUtf8 defaultMimeType) getMimeType $ JSONPosts.ext p
in
( paths
, At.Attachment
{ At.attachment_id = Nothing
, At.mimetype = mime
, At.creation_time = Posts.creation_time q
, At.sha256_hash = undefined
, At.phash = undefined
, At.illegal = False
, At.post_id = fromJust $ Posts.post_id q
, At.resolution = dim
}
) : []
insertRecord
:: Ord a
=> (b -> a)
@@ -286,9 +358,9 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
  in Map.insert pid (x : l) accMap
  compareAttMaps
- :: Map.Map Int64 [ Attachments.Attachment ]
+ :: Map.Map Int64 [ At.Attachment ]
  -> Int64
- -> [(JS.File, Attachments.Attachment)]
+ -> [(At.Paths, At.Attachment)]
  -> Bool
  compareAttMaps existing k v
  = (maybe (-1) length (Map.lookup k existing)) /= length v
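
Selection of new attachments happens in two passes above: compareAttMaps keeps only the post_ids whose attachment count differs from what the database already holds, then the Set.notMember filter drops any candidate whose SHA-256 digest is already stored. A self-contained sketch of that second pass, with toy tuples standing in for the At.Attachment records:

import qualified Data.Set as Set

-- Keep only candidates whose hash is not already known.
dedupeByHash :: [(String, String)] -> [String] -> [(String, String)]
dedupeByHash candidates existing_hashes =
    let known = Set.fromList existing_hashes
    in filter (\(_, sha) -> sha `Set.notMember` known) candidates

-- dedupeByHash [("a.png", "1111"), ("b.png", "2222")] ["2222"]
--   ==> [("a.png", "1111")]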

src/Hash.hs Normal file (16 additions)
View File

@@ -0,0 +1,16 @@
module Hash
( computeSHA256
) where
import Crypto.Hash (hashlazy, Digest, SHA256)
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteArray.Encoding as BA
import Data.Text (Text)
import qualified Data.Text.Encoding as T
-- Function to compute SHA256 hash of a file
computeSHA256 :: FilePath -> IO Text
computeSHA256 filePath = do
fileData <- B.readFile filePath
let hashDigest = hashlazy fileData :: Digest SHA256
return $ T.decodeUtf8 $ BA.convertToBase BA.Base16 hashDigest
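
Because B.readFile gives a lazy ByteString and hashlazy consumes it chunk by chunk, the whole attachment never has to be held in memory, and convertToBase Base16 turns the 32-byte digest into the 64-character hex Text stored in the sha256_hash column. A possible caller (hypothetical path; hashing an empty file yields the well-known e3b0c442… digest):

-- main :: IO ()
-- main = computeSHA256 "backup/leftypol/src/1700000000000.png" >>= print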

View File

@@ -22,10 +22,17 @@ data Post = Post
  , locked :: Maybe Int
  , cyclical :: Maybe J.Cyclical
  , last_modified :: Int
- , board :: String
+ , board :: Text
  , files :: Maybe [J.File]
  , resto :: Int
  , unique_ips :: Maybe Int
+ -- legacy attributes
+ , filename :: Maybe Text
+ , h :: Maybe Int
+ , w :: Maybe Int
+ , ext :: Maybe Text
+ , tim :: Maybe Text
  } deriving (Show, Generic)
  instance FromJSON Post
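
These optional fields cover the legacy JSON dumps that have no files array: parseLegacyPaths (in the Backfill changes above) rebuilds the on-disk locations from board, tim and ext. For illustration, with hypothetical values board = "b", tim = "1700000000000" and ext = ".png", the reconstructed locations would be:

-- file_path      = "b/src/1700000000000.png"
-- thumbnail_path = "b/thumb/1700000000000.png"

and the mimetype is looked up from the extension via defaultMimeLookup, falling back to defaultMimeType when ext is absent.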