WIP: Saving attachments

- handle legacy json case for attachement parsing
- compute sha256
- computing phash is TODO
This commit is contained in:
towards-a-new-leftypol 2024-01-19 19:05:02 -05:00
parent a78464fa81
commit 97884ee425
8 changed files with 181 additions and 23 deletions

View File

@ -76,6 +76,7 @@ executable chan-delorean
JSONCommonTypes
Common.PostsType
AttachmentType
Hash
-- LANGUAGE extensions used by modules in this package.
-- other-extensions:
@ -92,7 +93,10 @@ executable chan-delorean
http-conduit,
safe-exceptions,
http-types,
time
time,
cryptonite,
memory,
mime-types
-- Directories containing source files.
hs-source-dirs: src

View File

@ -5,7 +5,7 @@ let
f = { mkDerivation, base, stdenv, cabal-install,
aeson, safe-exceptions, bytestring, cmdargs,
http-conduit
http-conduit, cryptonite, memory, mime-types
}:
mkDerivation {
pname = "chan-delorean";
@ -15,6 +15,7 @@ let
isExecutable = true;
executableHaskellDepends = [
base safe-exceptions aeson bytestring cmdargs http-conduit
cryptonite memory mime-types
];
testHaskellDepends = [ cabal-install ];
license = "unknown";

View File

@ -10,6 +10,7 @@ WHERE opc.oid >= 16384 AND NOT amvalidate(opc.oid);
-- </bktree/sql/init.sql>
DROP TYPE IF EXISTS dimension CASCADE;
DROP TABLE IF EXISTS sites CASCADE;
DROP TABLE IF EXISTS boards CASCADE;
DROP TABLE IF EXISTS threads CASCADE;
@ -100,6 +101,11 @@ ON posts
FOR EACH ROW
EXECUTE FUNCTION update_post_body_search_index();
CREATE TYPE dimension AS
( width int
, height int
);
CREATE TABLE IF NOT EXISTS attachments
( attachment_id bigserial primary key
, mimetype text NOT NULL
@ -108,6 +114,7 @@ CREATE TABLE IF NOT EXISTS attachments
, phash bigint
, illegal boolean NOT NULL DEFAULT false
, post_id bigint NOT NULL
, resolution dimension
, CHECK
(
(mimetype NOT IN ('image/jpeg', 'image/png', 'image/gif'))

View File

@ -0,0 +1,38 @@
BEGIN TRANSACTION;
DROP TYPE IF EXISTS dimension CASCADE;
DROP TABLE IF EXISTS attachments CASCADE;
CREATE TYPE dimension AS
( width int
, height int
);
CREATE TABLE IF NOT EXISTS attachments
( attachment_id bigserial primary key
, mimetype text NOT NULL
, creation_time timestamp with time zone NOT NULL
, sha256_hash text NOT NULL UNIQUE
, phash bigint
, illegal boolean NOT NULL DEFAULT false
, post_id bigint NOT NULL
, resolution dimension
, CHECK
(
(mimetype NOT IN ('image/jpeg', 'image/png', 'image/gif'))
OR
(phash IS NOT NULL)
)
, CONSTRAINT post_fk FOREIGN KEY (post_id) REFERENCES posts (post_id) ON DELETE CASCADE
);
CREATE INDEX attachments_creation_time_idx ON attachments (creation_time);
CREATE INDEX attachments_post_id_idx ON attachments (post_id);
CREATE INDEX attachments_sha256_hash_idx ON attachments (sha256_hash);
--
-- Index using the bktree extension for quickly getting the closest phashes
CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops);
GRANT SELECT ON attachments TO chan_archive_anon;
GRANT ALL ON attachments TO chan_archiver;
COMMIT;

View File

@ -1,6 +1,8 @@
{-# LANGUAGE DeriveAnyClass #-}
module AttachmentType
( Attachment (..)
, Dimension (..)
, Paths (..)
) where
import GHC.Generics
@ -9,6 +11,16 @@ import Data.Aeson (FromJSON, ToJSON)
import Data.Text (Text)
import Data.Time.Clock (UTCTime)
data Dimension = Dimension
{ width :: Int
, height :: Int
} deriving (Show, Generic, FromJSON, ToJSON)
data Paths = Paths
{ file_path :: Text
, thumbnail_path :: Text
}
data Attachment = Attachment
{ attachment_id :: Maybe Int64
, mimetype :: Text
@ -17,4 +29,5 @@ data Attachment = Attachment
, phash :: Int64
, illegal :: Bool
, post_id :: Int64
, resolution :: Maybe Dimension
} deriving (Show, Generic, FromJSON, ToJSON)

View File

@ -20,6 +20,9 @@ import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Data.Time.Clock (UTCTime)
import qualified Data.Map as Map
import Data.Maybe (fromJust)
import Data.Text (Text, unpack)
import Data.Text.Encoding (decodeUtf8)
import Network.Mime (defaultMimeLookup, defaultMimeType)
import JSONParsing
import JSONSettings
@ -29,8 +32,9 @@ import qualified DataClient as Client
import qualified SitesType as Sites
import qualified BoardsType as Boards
import qualified ThreadType as Threads
import qualified AttachmentType as Attachments
import qualified AttachmentType as At
import qualified Common.PostsType as Posts
import qualified Hash as Hash
newtype SettingsCLI = SettingsCLI
{ jsonFile :: FilePath
@ -228,18 +232,29 @@ setPostIdInPosts post_pairs ids = map f ids
(\(i, j) -> (i, j { Posts.post_id = Just asdf1 })) (post_map Map.! (asdf2, asdf3))
fileToAttachment :: Posts.Post -> JS.File -> Attachments.Attachment
fileToAttachment :: Posts.Post -> JS.File -> At.Attachment
fileToAttachment post file =
Attachments.Attachment
{ Attachments.attachment_id = Nothing
, Attachments.mimetype = maybe "undefined/undefined" id (JS.mime file)
, Attachments.creation_time = Posts.creation_time post
, Attachments.sha256_hash = undefined
, Attachments.phash = undefined
, Attachments.illegal = False
, Attachments.post_id = fromJust $ Posts.post_id post
At.Attachment
{ At.attachment_id = Nothing
, At.mimetype = maybe "undefined/undefined" id (JS.mime file)
, At.creation_time = Posts.creation_time post
, At.sha256_hash = undefined
, At.phash = undefined
, At.illegal = False
, At.post_id = fromJust $ Posts.post_id post
, At.resolution = dim
}
where
dim = (JS.w file) >>= \w ->
((JS.h file) >>= \h ->
Just $ At.Dimension w h)
getMimeType :: Text -> Text
getMimeType ext = decodeUtf8 $ defaultMimeLookup ext
processFiles :: JSONSettings -> [(JSONPosts.Post, Posts.Post)] -> IO ()
processFiles settings post_pairs = do -- perfect just means that our posts have ids, they're already inserted into the db
existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id . snd) post_pairs)
@ -249,8 +264,8 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
putStrLn $ "Error fetching boards: " ++ show err
exitFailure
Right existing_attachments -> do
let map_existing :: Map.Map Int64 [ Attachments.Attachment ] =
foldl (insertRecord Attachments.post_id) Map.empty existing_attachments
let map_existing :: Map.Map Int64 [ At.Attachment ] =
foldl (insertRecord At.post_id) Map.empty existing_attachments
-- have things like sha256 already
-- how do we know that a `elem` attachments_on_board and a `elem` existing_attachments
@ -258,22 +273,79 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
-- same with attachments_on_board into `map_should_exist :: Map post_id -> Set Attachment`
--
-- then run through the keys and compare size of the sets
let attachments_on_board :: [(At.Paths, At.Attachment)] =
concatMap parseAttachments post_pairs
let attachments_on_board :: [(JS.File, Attachments.Attachment)] = concatMap
(\(p, q) -> map (\x -> (x, fileToAttachment q x)) (maybe [] id $ JSONPosts.files p))
post_pairs
let map_should_exist :: Map.Map Int64 [(JS.File, Attachments.Attachment)] =
foldl (insertRecord (Attachments.post_id . snd)) Map.empty attachments_on_board
let map_should_exist :: Map.Map Int64 [(At.Paths, At.Attachment)] =
foldl (insertRecord (At.post_id . snd)) Map.empty attachments_on_board
let to_insert_map = Map.filterWithKey (compareAttMaps map_existing) map_should_exist
let to_insert = foldr (++) [] $ Map.elems to_insert_map
have_hash <- mapM computeAttachmentHash to_insert
let existing_hashes :: Set.Set Text =
Set.fromList $ map At.sha256_hash existing_attachments
let to_insert_ = filter ((`Set.notMember` existing_hashes) . At.sha256_hash) have_hash
-- TODO: Concat all values in to_insert_map and
-- go ahead and compute sha256 and phashes on them.
-- ensure that the sha256 in to_insert_map is not in map_existing
return ()
where
computeAttachmentHash :: (At.Paths, At.Attachment) -> IO At.Attachment
computeAttachmentHash (p, q) = do
let f = backup_read_root settings <> "/" <> unpack (At.file_path p)
sha256_sum <- Hash.computeSHA256 f
return q { At.sha256_hash = sha256_sum }
computeAttachmentPhash :: (At.Paths, At.Attachment) -> IO At.Attachment
computeAttachmentPhash = undefined
parseLegacyPaths :: JSONPosts.Post -> Maybe At.Paths
parseLegacyPaths post = do
tim <- JSONPosts.tim post
ext <- JSONPosts.ext post
let board = JSONPosts.board post
let file_path = board <> "/src/" <> tim <> ext
let thumbnail_path = board <> "/thumb/" <> tim <> ext
Just $ At.Paths file_path thumbnail_path
parseAttachments :: (JSONPosts.Post, Posts.Post) -> [(At.Paths, At.Attachment)]
parseAttachments (p, q) =
case JSONPosts.files p of
Just files -> map (\x -> (At.Paths (JS.file_path x) (JS.thumb_path x), fileToAttachment q x)) files
Nothing ->
case parseLegacyPaths p of
Nothing -> []
Just paths ->
let
dim = (JSONPosts.w p) >>= \w -> ((JSONPosts.h p) >>= \h -> Just $ At.Dimension w h)
mime = maybe (decodeUtf8 defaultMimeType) getMimeType $ JSONPosts.ext p
in
( paths
, At.Attachment
{ At.attachment_id = Nothing
, At.mimetype = mime
, At.creation_time = Posts.creation_time q
, At.sha256_hash = undefined
, At.phash = undefined
, At.illegal = False
, At.post_id = fromJust $ Posts.post_id q
, At.resolution = dim
}
) : []
insertRecord
:: Ord a
=> (b -> a)
@ -286,9 +358,9 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
in Map.insert pid (x : l) accMap
compareAttMaps
:: Map.Map Int64 [ Attachments.Attachment ]
:: Map.Map Int64 [ At.Attachment ]
-> Int64
-> [(JS.File, Attachments.Attachment)]
-> [(At.Paths, At.Attachment)]
-> Bool
compareAttMaps existing k v
= (maybe (-1) length (Map.lookup k existing)) /= length v

16
src/Hash.hs Normal file
View File

@ -0,0 +1,16 @@
module Hash
( computeSHA256
) where
import Crypto.Hash (hashlazy, Digest, SHA256)
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteArray.Encoding as BA
import Data.Text (Text)
import qualified Data.Text.Encoding as T
-- Function to compute SHA256 hash of a file
computeSHA256 :: FilePath -> IO Text
computeSHA256 filePath = do
fileData <- B.readFile filePath
let hashDigest = hashlazy fileData :: Digest SHA256
return $ T.decodeUtf8 $ BA.convertToBase BA.Base16 hashDigest

View File

@ -22,10 +22,17 @@ data Post = Post
, locked :: Maybe Int
, cyclical :: Maybe J.Cyclical
, last_modified :: Int
, board :: String
, board :: Text
, files :: Maybe [J.File]
, resto :: Int
, unique_ips :: Maybe Int
-- legacy attributes
, filename :: Maybe Text
, h :: Maybe Int
, w :: Maybe Int
, ext :: Maybe Text
, tim :: Maybe Text
} deriving (Show, Generic)
instance FromJSON Post