WIP: Saving attachments

- handle legacy JSON case for attachment parsing
- compute sha256
- computing phash is TODO
parent a78464fa81
commit 97884ee425
@@ -76,6 +76,7 @@ executable chan-delorean
        JSONCommonTypes
        Common.PostsType
        AttachmentType
        Hash

    -- LANGUAGE extensions used by modules in this package.
    -- other-extensions:
@@ -92,7 +93,10 @@ executable chan-delorean
        http-conduit,
        safe-exceptions,
        http-types,
        time
        time,
        cryptonite,
        memory,
        mime-types

    -- Directories containing source files.
    hs-source-dirs: src
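The three new build-depends line up with the changes further down: cryptonite provides the SHA-256 digest, memory its Base16 (hex) encoding, and mime-types the extension-to-mimetype lookup used for legacy posts. A small standalone check of the mime-types calls (a sketch, not part of the commit):

```haskell
{-# LANGUAGE OverloadedStrings #-}

import Network.Mime (defaultMimeLookup, defaultMimeType)

main :: IO ()
main = do
    -- lookup is by file name / extension; unknown extensions fall back
    -- to defaultMimeType
    print (defaultMimeLookup "lain.png")  -- "image/png"
    print defaultMimeType                 -- "application/octet-stream"
```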
@@ -5,7 +5,7 @@ let

  f = { mkDerivation, base, stdenv, cabal-install,
        aeson, safe-exceptions, bytestring, cmdargs,
        http-conduit
        http-conduit, cryptonite, memory, mime-types
      }:
      mkDerivation {
        pname = "chan-delorean";
@@ -15,6 +15,7 @@ let
        isExecutable = true;
        executableHaskellDepends = [
          base safe-exceptions aeson bytestring cmdargs http-conduit
          cryptonite memory mime-types
        ];
        testHaskellDepends = [ cabal-install ];
        license = "unknown";
@@ -10,6 +10,7 @@ WHERE opc.oid >= 16384 AND NOT amvalidate(opc.oid);

-- </bktree/sql/init.sql>

DROP TYPE IF EXISTS dimension CASCADE;
DROP TABLE IF EXISTS sites CASCADE;
DROP TABLE IF EXISTS boards CASCADE;
DROP TABLE IF EXISTS threads CASCADE;
@@ -100,6 +101,11 @@ ON posts
FOR EACH ROW
EXECUTE FUNCTION update_post_body_search_index();

CREATE TYPE dimension AS
( width int
, height int
);

CREATE TABLE IF NOT EXISTS attachments
( attachment_id bigserial primary key
, mimetype text NOT NULL
@@ -108,6 +114,7 @@ CREATE TABLE IF NOT EXISTS attachments
, phash bigint
, illegal boolean NOT NULL DEFAULT false
, post_id bigint NOT NULL
, resolution dimension
, CHECK
  (
    (mimetype NOT IN ('image/jpeg', 'image/png', 'image/gif'))
@@ -0,0 +1,38 @@
BEGIN TRANSACTION;

DROP TYPE IF EXISTS dimension CASCADE;
DROP TABLE IF EXISTS attachments CASCADE;

CREATE TYPE dimension AS
( width int
, height int
);

CREATE TABLE IF NOT EXISTS attachments
( attachment_id bigserial primary key
, mimetype text NOT NULL
, creation_time timestamp with time zone NOT NULL
, sha256_hash text NOT NULL UNIQUE
, phash bigint
, illegal boolean NOT NULL DEFAULT false
, post_id bigint NOT NULL
, resolution dimension
, CHECK
  (
    (mimetype NOT IN ('image/jpeg', 'image/png', 'image/gif'))
    OR
    (phash IS NOT NULL)
  )
, CONSTRAINT post_fk FOREIGN KEY (post_id) REFERENCES posts (post_id) ON DELETE CASCADE
);
CREATE INDEX attachments_creation_time_idx ON attachments (creation_time);
CREATE INDEX attachments_post_id_idx ON attachments (post_id);
CREATE INDEX attachments_sha256_hash_idx ON attachments (sha256_hash);
--
-- Index using the bktree extension for quickly getting the closest phashes
CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops);

GRANT SELECT ON attachments TO chan_archive_anon;
GRANT ALL ON attachments TO chan_archiver;

COMMIT;
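The CHECK above only allows a NULL phash on rows whose mimetype is not one of the common image types, so any jpeg/png/gif row must carry a perceptual hash before it can be inserted. A rough Haskell mirror of that rule, usable as a pre-insert guard (a sketch; the function name is an assumption, not part of this commit):

```haskell
{-# LANGUAGE OverloadedStrings #-}

import Data.Int   (Int64)
import Data.Maybe (isJust)
import Data.Text  (Text)

-- Mirrors the table's CHECK constraint: the listed image mimetypes must
-- carry a phash; anything else may leave the column NULL.
satisfiesPhashCheck :: Text -> Maybe Int64 -> Bool
satisfiesPhashCheck mimetype phash =
    mimetype `notElem` ["image/jpeg", "image/png", "image/gif"] || isJust phash
```

Note that the Haskell Attachment type below declares phash :: Int64 rather than Maybe Int64, and the WIP code still fills it with undefined.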
@@ -1,6 +1,8 @@
{-# LANGUAGE DeriveAnyClass #-}
module AttachmentType
( Attachment (..)
, Dimension (..)
, Paths (..)
) where

import GHC.Generics
@@ -9,6 +11,16 @@ import Data.Aeson (FromJSON, ToJSON)
import Data.Text (Text)
import Data.Time.Clock (UTCTime)

data Dimension = Dimension
    { width :: Int
    , height :: Int
    } deriving (Show, Generic, FromJSON, ToJSON)

data Paths = Paths
    { file_path :: Text
    , thumbnail_path :: Text
    }

data Attachment = Attachment
    { attachment_id :: Maybe Int64
    , mimetype :: Text
@@ -17,4 +29,5 @@ data Attachment = Attachment
    , phash :: Int64
    , illegal :: Bool
    , post_id :: Int64
    , resolution :: Maybe Dimension
    } deriving (Show, Generic, FromJSON, ToJSON)
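DeriveAnyClass together with the Generic instance is what makes the FromJSON/ToJSON deriving above work; Paths deliberately derives nothing and is only a helper for locating files on disk. A quick round-trip check of the derived instances (a sketch; assumes the module above is importable):

```haskell
import qualified Data.Aeson as A
import AttachmentType (Dimension (..))

main :: IO ()
main = do
    -- encodes to an object with "width" and "height" keys and decodes back
    let bytes = A.encode (Dimension 640 480)
    print bytes
    print (fmap width (A.decode bytes :: Maybe Dimension))  -- Just 640
```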
src/Backfill.hs (112 changed lines)
@@ -20,6 +20,9 @@ import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Data.Time.Clock (UTCTime)
import qualified Data.Map as Map
import Data.Maybe (fromJust)
import Data.Text (Text, unpack)
import Data.Text.Encoding (decodeUtf8)
import Network.Mime (defaultMimeLookup, defaultMimeType)

import JSONParsing
import JSONSettings
@@ -29,8 +32,9 @@ import qualified DataClient as Client
import qualified SitesType as Sites
import qualified BoardsType as Boards
import qualified ThreadType as Threads
import qualified AttachmentType as Attachments
import qualified AttachmentType as At
import qualified Common.PostsType as Posts
import qualified Hash as Hash

newtype SettingsCLI = SettingsCLI
    { jsonFile :: FilePath
@@ -228,18 +232,29 @@ setPostIdInPosts post_pairs ids = map f ids
        (\(i, j) -> (i, j { Posts.post_id = Just asdf1 })) (post_map Map.! (asdf2, asdf3))


fileToAttachment :: Posts.Post -> JS.File -> Attachments.Attachment
fileToAttachment :: Posts.Post -> JS.File -> At.Attachment
fileToAttachment post file =
    Attachments.Attachment
    { Attachments.attachment_id = Nothing
    , Attachments.mimetype = maybe "undefined/undefined" id (JS.mime file)
    , Attachments.creation_time = Posts.creation_time post
    , Attachments.sha256_hash = undefined
    , Attachments.phash = undefined
    , Attachments.illegal = False
    , Attachments.post_id = fromJust $ Posts.post_id post
    At.Attachment
    { At.attachment_id = Nothing
    , At.mimetype = maybe "undefined/undefined" id (JS.mime file)
    , At.creation_time = Posts.creation_time post
    , At.sha256_hash = undefined
    , At.phash = undefined
    , At.illegal = False
    , At.post_id = fromJust $ Posts.post_id post
    , At.resolution = dim
    }

    where
        dim = (JS.w file) >>= \w ->
            ((JS.h file) >>= \h ->
                Just $ At.Dimension w h)


getMimeType :: Text -> Text
getMimeType ext = decodeUtf8 $ defaultMimeLookup ext


processFiles :: JSONSettings -> [(JSONPosts.Post, Posts.Post)] -> IO ()
processFiles settings post_pairs = do -- perfect just means that our posts have ids, they're already inserted into the db
    existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id . snd) post_pairs)
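The dim binding in fileToAttachment chains two Maybe values with nested >>=. Purely as an aside, the same logic reads a little shorter in Applicative style (a sketch; the helper name is illustrative and assumes the qualified AttachmentType import):

```haskell
import qualified AttachmentType as At

-- Nothing unless both width and height are present, otherwise Just (Dimension w h).
dimFromWH :: Maybe Int -> Maybe Int -> Maybe At.Dimension
dimFromWH mw mh = At.Dimension <$> mw <*> mh
```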
@@ -249,8 +264,8 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
            putStrLn $ "Error fetching boards: " ++ show err
            exitFailure
        Right existing_attachments -> do
            let map_existing :: Map.Map Int64 [ Attachments.Attachment ] =
                    foldl (insertRecord Attachments.post_id) Map.empty existing_attachments
            let map_existing :: Map.Map Int64 [ At.Attachment ] =
                    foldl (insertRecord At.post_id) Map.empty existing_attachments

            -- have things like sha256 already
            -- how do we know that a `elem` attachments_on_board and a `elem` existing_attachments
@@ -258,22 +273,79 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
            -- same with attachments_on_board into `map_should_exist :: Map post_id -> Set Attachment`
            --
            -- then run through the keys and compare size of the sets
            let attachments_on_board :: [(At.Paths, At.Attachment)] =
                    concatMap parseAttachments post_pairs

            let attachments_on_board :: [(JS.File, Attachments.Attachment)] = concatMap
                    (\(p, q) -> map (\x -> (x, fileToAttachment q x)) (maybe [] id $ JSONPosts.files p))
                    post_pairs

            let map_should_exist :: Map.Map Int64 [(JS.File, Attachments.Attachment)] =
                    foldl (insertRecord (Attachments.post_id . snd)) Map.empty attachments_on_board
            let map_should_exist :: Map.Map Int64 [(At.Paths, At.Attachment)] =
                    foldl (insertRecord (At.post_id . snd)) Map.empty attachments_on_board

            let to_insert_map = Map.filterWithKey (compareAttMaps map_existing) map_should_exist

            let to_insert = foldr (++) [] $ Map.elems to_insert_map

            have_hash <- mapM computeAttachmentHash to_insert

            let existing_hashes :: Set.Set Text =
                    Set.fromList $ map At.sha256_hash existing_attachments

            let to_insert_ = filter ((`Set.notMember` existing_hashes) . At.sha256_hash) have_hash

            -- TODO: Concat all values in to_insert_map and
            -- go ahead and compute sha256 and phashes on them.
            -- ensure that the sha256 in to_insert_map is not in map_existing

            return ()

    where
        computeAttachmentHash :: (At.Paths, At.Attachment) -> IO At.Attachment
        computeAttachmentHash (p, q) = do
            let f = backup_read_root settings <> "/" <> unpack (At.file_path p)
            sha256_sum <- Hash.computeSHA256 f

            return q { At.sha256_hash = sha256_sum }

        computeAttachmentPhash :: (At.Paths, At.Attachment) -> IO At.Attachment
        computeAttachmentPhash = undefined

        parseLegacyPaths :: JSONPosts.Post -> Maybe At.Paths
        parseLegacyPaths post = do
            tim <- JSONPosts.tim post

            ext <- JSONPosts.ext post

            let board = JSONPosts.board post

            let file_path = board <> "/src/" <> tim <> ext
            let thumbnail_path = board <> "/thumb/" <> tim <> ext

            Just $ At.Paths file_path thumbnail_path


        parseAttachments :: (JSONPosts.Post, Posts.Post) -> [(At.Paths, At.Attachment)]
        parseAttachments (p, q) =
            case JSONPosts.files p of
                Just files -> map (\x -> (At.Paths (JS.file_path x) (JS.thumb_path x), fileToAttachment q x)) files
                Nothing ->
                    case parseLegacyPaths p of
                        Nothing -> []
                        Just paths ->
                            let
                                dim = (JSONPosts.w p) >>= \w -> ((JSONPosts.h p) >>= \h -> Just $ At.Dimension w h)
                                mime = maybe (decodeUtf8 defaultMimeType) getMimeType $ JSONPosts.ext p
                            in
                                ( paths
                                , At.Attachment
                                    { At.attachment_id = Nothing
                                    , At.mimetype = mime
                                    , At.creation_time = Posts.creation_time q
                                    , At.sha256_hash = undefined
                                    , At.phash = undefined
                                    , At.illegal = False
                                    , At.post_id = fromJust $ Posts.post_id q
                                    , At.resolution = dim
                                    }
                                ) : []

        insertRecord
            :: Ord a
            => (b -> a)
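The new have_hash / existing_hashes / to_insert_ steps above filter freshly hashed attachments against the sha256 digests already stored; note they rely on a qualified Data.Set import that does not appear in the import hunks of this commit. A self-contained sketch of the same dedup step with the assumed import spelled out (names here are illustrative, not the project's):

```haskell
import           Data.Int  (Int64)
import qualified Data.Set  as Set
import           Data.Text (Text)

-- Keep only candidates whose sha256 digest is not already in the database.
dedupBySha256
    :: [Text]           -- sha256 hashes of attachments already stored
    -> [(Text, Int64)]  -- (sha256, post_id) of freshly hashed candidates
    -> [(Text, Int64)]
dedupBySha256 existing candidates =
    filter ((`Set.notMember` known) . fst) candidates
  where
    known = Set.fromList existing
```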
@@ -286,9 +358,9 @@ processFiles settings post_pairs = do -- perfect just means that our posts have
            in Map.insert pid (x : l) accMap

        compareAttMaps
            :: Map.Map Int64 [ Attachments.Attachment ]
            :: Map.Map Int64 [ At.Attachment ]
            -> Int64
            -> [(JS.File, Attachments.Attachment)]
            -> [(At.Paths, At.Attachment)]
            -> Bool
        compareAttMaps existing k v
            = (maybe (-1) length (Map.lookup k existing)) /= length v
@@ -0,0 +1,16 @@
module Hash
( computeSHA256
) where

import Crypto.Hash (hashlazy, Digest, SHA256)
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteArray.Encoding as BA
import Data.Text (Text)
import qualified Data.Text.Encoding as T

-- Function to compute SHA256 hash of a file
computeSHA256 :: FilePath -> IO Text
computeSHA256 filePath = do
    fileData <- B.readFile filePath
    let hashDigest = hashlazy fileData :: Digest SHA256
    return $ T.decodeUtf8 $ BA.convertToBase BA.Base16 hashDigest
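Hash.computeSHA256 covers the content hash; the phash column (and computeAttachmentPhash = undefined above) is still open. One possible shape for it, sketched here with JuicyPixels as an assumed extra dependency, is a simple 8x8 average hash packed into the Int64 the bktree index stores; a DCT-based pHash would be a drop-in replacement with better robustness. The module and function names are hypothetical, not part of this commit:

```haskell
module PhashSketch (computeAverageHash) where

import Codec.Picture
import Data.Bits (setBit)
import Data.Int (Int64)
import Data.List (genericLength)

-- 8x8 average hash: greyscale the image, average each of 64 cells,
-- set a bit for every cell brighter than the overall mean.
computeAverageHash :: FilePath -> IO (Either String Int64)
computeAverageHash path = fmap (fmap (hashImage . convertRGB8)) (readImage path)

hashImage :: Image PixelRGB8 -> Int64
hashImage img = foldl step 0 (zip [0 ..] cells)
  where
    step acc (i, v) = if v >= mean then setBit acc i else acc
    w = imageWidth img
    h = imageHeight img
    grey x y =
        let PixelRGB8 r g b = pixelAt img x y
        in (fromIntegral r + fromIntegral g + fromIntegral b) / 3 :: Double
    -- mean brightness of one cell of the 8x8 grid
    cell cx cy =
        let xs = [ cx * w `div` 8 .. (cx + 1) * w `div` 8 - 1 ]
            ys = [ cy * h `div` 8 .. (cy + 1) * h `div` 8 - 1 ]
            ps = [ grey x y | x <- xs, y <- ys ]
        in sum ps / max 1 (genericLength ps)
    cells = [ cell cx cy | cy <- [0 .. 7], cx <- [0 .. 7] ]
    mean = sum cells / 64
```

The Hamming distance between two such 64-bit hashes is presumably what the spgist/bktree index in the schema above is meant to search over.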
@@ -22,10 +22,17 @@ data Post = Post
    , locked :: Maybe Int
    , cyclical :: Maybe J.Cyclical
    , last_modified :: Int
    , board :: String
    , board :: Text
    , files :: Maybe [J.File]
    , resto :: Int
    , unique_ips :: Maybe Int

    -- legacy attributes
    , filename :: Maybe Text
    , h :: Maybe Int
    , w :: Maybe Int
    , ext :: Maybe Text
    , tim :: Maybe Text
    } deriving (Show, Generic)

instance FromJSON Post