Move everything except for main method from Backfill into Lib (for reuse by consumer Main module)

This commit is contained in:
towards-a-new-leftypol 2024-04-05 01:31:26 -04:00
parent 2588724b8c
commit 0086dab7f8
4 changed files with 681 additions and 650 deletions

View File

@ -1,657 +1,12 @@
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Redundant bracket" #-}
{-# HLINT ignore "Use fromMaybe" #-}
{-# LANGUAGE OverloadedStrings #-}
module Main where module Main where
import System.Exit import System.Exit
import Data.Int (Int64)
import Control.Monad (filterM)
import Data.Aeson (decode)
import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy as B
import Data.Aeson (decode)
import System.Console.CmdArgs import System.Console.CmdArgs
import System.Directory
( listDirectory
, doesFileExist
, copyFile
, createDirectoryIfMissing
)
import System.FilePath ((</>), (<.>), takeExtension)
import Data.List (find, isSuffixOf, foldl', sortBy)
import Data.Ord (comparing)
import qualified Data.Set as Set
import Data.Set (Set)
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Data.Time.Clock (UTCTime)
import qualified Data.Map as Map
import Data.Maybe (fromJust)
import Data.Text (Text, unpack)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8)
import Network.Mime (defaultMimeLookup)
import PerceptualHash (fileHash)
import JSONParsing
import Common.Server.JSONSettings import Common.Server.JSONSettings
import qualified JSONCommonTypes as JS import Lib
import qualified JSONPost as JSONPosts
import qualified Network.DataClient as Client
import qualified SitesType as Sites
import qualified BoardsType as Boards
import qualified ThreadType as Threads
import qualified Common.AttachmentType as At
import qualified Common.PostsType as Posts
import qualified Hash as Hash
import qualified Data.WordUtil as Words
newtype SettingsCLI = SettingsCLI
{ jsonFile :: FilePath
} deriving (Show, Data, Typeable)
listCatalogDirectories :: JSONSettings -> IO [ FilePath ]
listCatalogDirectories settings = do
allDirs <- listDirectory (backup_read_root settings)
let filteredDirs = filter (`notElem` excludedDirs) allDirs
filterM hasCatalog filteredDirs
where
excludedDirs = ["sfw", "alt", "overboard"]
hasCatalog dir = do
let catalogPath = backup_read_root settings </> dir </> "catalog.json"
doesFileExist catalogPath
ensureSiteExists :: JSONSettings -> IO Sites.Site
ensureSiteExists settings = do
sitesResult <- Client.getAllSites settings
case sitesResult of
Right siteList ->
case find (\site -> Sites.name site == site_name settings) siteList of
Just site -> do
putStrLn $ site_name settings ++ " already exists!"
return site
Nothing -> do
putStrLn $ site_name settings ++ " does not exist. Creating..."
postResult <- Client.postSite settings
case postResult of
Right (site:_) -> do
putStrLn $ "Successfully created " ++ site_name settings ++ ". " ++ show site
return site
Right [] -> do
putStrLn "Did not get new site id back from postgrest"
exitFailure
Left err -> do
putStrLn $ "Failed to create " ++ site_name settings
++ " Error: " ++ show err
exitFailure
Left err -> do
putStrLn $ "Error fetching sites: " ++ show err
exitFailure
createArchivesForNewBoards
:: JSONSettings
-> Set String
-> [ String ]
-> Int
-> IO [ Boards.Board ]
createArchivesForNewBoards settings dirsSet archived_boards siteid = do
let archivedBoardsSet = Set.fromList archived_boards
-- Find boards that are in dirs but not in archived_boards
let boardsToArchive = dirsSet `Set.difference` archivedBoardsSet
putStrLn "Creating boards:"
mapM_ putStrLn boardsToArchive
post_result <- Client.postBoards settings (Set.toList boardsToArchive) siteid
case post_result of
Left err -> do
putStrLn $ "Error posting boards: " ++ show err
exitFailure
Right boards -> do
putStrLn "Created the following boards:"
mapM_ (putStrLn . Boards.pathpart) boards
return boards
apiThreadToArchiveThread :: Int -> Thread -> Threads.Thread
apiThreadToArchiveThread board_id_ json_thread =
Threads.Thread
{ Threads.thread_id = undefined
, Threads.board_thread_id = no json_thread
, Threads.creation_time = epochToUTCTime $ fromIntegral (time json_thread)
, Threads.board_id = board_id_
}
epochToUTCTime :: Int -> UTCTime
epochToUTCTime = posixSecondsToUTCTime . realToFrac
createArchivesForNewThreads
:: JSONSettings
-> [ Thread ]
-> [ Threads.Thread ]
-> Boards.Board
-> IO [ Threads.Thread ]
createArchivesForNewThreads settings all_threads archived_threads board = do
putStrLn $ "Creating " ++ show (length threads_to_create) ++ " threads."
threads_result <- Client.postThreads settings (map (apiThreadToArchiveThread board_id) threads_to_create)
case threads_result of
Left err -> do
putStrLn $ "Error creating threads: " ++ show err
exitFailure
Right new_threads -> return new_threads
where
board_id :: Int = Boards.board_id board
archived_board_thread_ids :: Set.Set Int
archived_board_thread_ids =
Set.fromList $ map Threads.board_thread_id archived_threads
threads_to_create :: [ Thread ]
threads_to_create =
filter
((`Set.notMember` archived_board_thread_ids) . no)
all_threads
ensureThreads :: JSONSettings -> Boards.Board -> [ Thread ] -> IO [ Threads.Thread ]
ensureThreads settings board all_threads = do
threads_result <- Client.getThreads settings (Boards.board_id board) (map no all_threads)
case threads_result of
Left err -> do
putStrLn $ "Error fetching threads: " ++ show err
exitFailure
Right archived_threads -> do
putStrLn $ show (length archived_threads) ++ " threads already exist."
new_threads <- createArchivesForNewThreads settings all_threads archived_threads board
return $ archived_threads ++ new_threads
readPosts
:: JSONSettings
-> Boards.Board
-> Threads.Thread
-> IO (Threads.Thread, [ JSONPosts.Post ])
readPosts settings board thread = do
result <- parsePosts thread_filename
case result of
Left err -> do
putStrLn $ "Failed to parse the JSON file " ++ thread_filename ++ " error: " ++ err
return (thread, [])
Right posts_wrapper -> return (thread, JSONPosts.posts posts_wrapper)
where
thread_filename :: FilePath
thread_filename = backupDir </> "res" </> (show (Threads.board_thread_id thread) ++ ".json")
backupDir :: FilePath
backupDir = backup_read_root settings </> Boards.pathpart board
apiPostToPostKey :: Threads.Thread -> JSONPosts.Post -> Client.PostId
apiPostToPostKey thread post =
Client.PostId
{ Client.thread_id = (Threads.thread_id thread)
, Client.board_post_id = (JSONPosts.no post)
}
-- Convert Post to DbPost
apiPostToArchivePost :: Int -> Threads.Thread -> JSONPosts.Post -> Posts.Post
apiPostToArchivePost local_idx thread post =
Posts.Post
{ Posts.post_id = Nothing
, Posts.board_post_id = JSONPosts.no post
, Posts.creation_time = posixSecondsToUTCTime (realToFrac $ JSONPosts.time post)
, Posts.body = JSONPosts.com post
, Posts.name = JSONPosts.name post
, Posts.subject = JSONPosts.sub post
, Posts.email = JSONPosts.email post
, Posts.thread_id = Threads.thread_id thread
, Posts.embed = JSONPosts.embed post
, Posts.local_idx = local_idx
}
-- | A version of 'concatMap' that works with a monadic predicate.
-- Stolen from package extra Control.Monad.Extra
concatMapM :: Monad m => (a -> m [b]) -> [a] -> m [b]
{-# INLINE concatMapM #-}
concatMapM op = foldr f (pure [])
where f x xs = do
x_ <- op x
if null x_
then xs
else do
xs_ <- xs
pure $ x_ ++ xs_
addPostsToTuples
:: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)]
-> [ Posts.Post ]
-> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)]
addPostsToTuples tuples posts = map f posts
where
post_map :: Map.Map (Int64, Int64) (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)
post_map = Map.fromList (map (\(a, b, c, d) -> ((Threads.thread_id c, JSONPosts.no d), (a, b, c, d))) tuples)
f :: Posts.Post -> (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)
f new_post =
(\(a, b, c, d) -> (a, b, c, d, new_post))
(post_map Map.! (Posts.thread_id new_post, Posts.board_post_id new_post))
fileToAttachment :: Int -> Posts.Post -> JS.File -> At.Attachment
fileToAttachment i post file =
At.Attachment
{ At.mimetype = maybe guessed_mime id (JS.mime file)
, At.creation_time = Posts.creation_time post
, At.sha256_hash = undefined
, At.phash = Nothing
, At.illegal = False
, At.post_id = fromJust $ Posts.post_id post
, At.resolution = dim
, At.file_extension = Just extension
, At.thumb_extension = Just thumb_extension
, At.original_filename = Just $ JS.filename file <> "." <> extension
, At.file_size_bytes = JS.fsize file
, At.board_filename = JS.id file
, At.spoiler = maybe False id $ JS.spoiler file
, At.attachment_idx = i
}
where
extension = JS.ext file
thumb_extension = T.pack $ drop 1 $ takeExtension $ unpack $ JS.thumb_path file
guessed_mime = getMimeType extension
dim = (JS.w file) >>= \w ->
((JS.h file) >>= \h ->
Just $ At.Dimension w h)
getMimeType :: Text -> Text
getMimeType ext = decodeUtf8 $ defaultMimeLookup ext
phash_mimetypes :: Set.Set Text
phash_mimetypes = Set.fromList
[ "image/jpeg"
, "image/png"
, "image/gif"
]
copyFiles :: JSONSettings -> (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment) -> IO ()
copyFiles settings (site, board, thread, _, path, attachment) = do
destination_exists <- doesFileExist dest
if not destination_exists
then do
src_exists <- doesFileExist src
createDirectoryIfMissing True common_dest
if src_exists
then putStrLn ("Copying " ++ src) >> copyFile src dest
else return ()
thumb_exists <- doesFileExist thumb_src
if thumb_exists
then putStrLn ("Copying " ++ thumb_src) >> copyFile thumb_src thumb_dest
else return ()
else return ()
-- src = (At.file_path | At.thumb_path)
-- dest = <media_root>/<website_name>/<boardpart>/<board_thread_id>/<sha>.<ext>
where
src :: FilePath
src = At.file_path path
thumb_src :: FilePath
thumb_src = At.thumbnail_path path
dest :: FilePath
dest = common_dest
</> (unpack $ At.board_filename attachment)
<.> (unpack $ fromJust $ At.file_extension attachment)
thumb_dest :: FilePath
thumb_dest = common_dest
</> "thumbnail_" <> (unpack $ At.board_filename attachment)
<.> (unpack $ fromJust $ At.thumb_extension attachment)
common_dest :: FilePath
common_dest
= (media_root_path settings)
</> Sites.name site
</> Boards.pathpart board
</> (show $ Threads.board_thread_id thread)
processFiles :: JSONSettings -> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)] -> IO ()
processFiles settings tuples = do -- perfect just means that our posts have ids, they're already inserted into the db
let ps = map (\(_, _, _, _, x) -> x) tuples
existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id) ps)
case existing_attachments_result of
Left err -> do
putStrLn $ "Error fetching attachments: " ++ show err
exitFailure
Right existing_attachments -> do
let map_existing :: Map.Map (Int64, Text) [ At.Attachment ] =
foldl'
(insertRecord (\a -> (At.post_id a, At.board_filename a)))
Map.empty
existing_attachments
let attachments_on_board :: [(Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)] =
concatMap parseAttachments tuples
-- attachments_on_board are the only files that can be copied into the archive dir right now
-- since that's where we have the src filename. except here the Attachment doesn't have a sha hash yet
-- so we can't build the destination filename.
let map_should_exist :: Map.Map (Int64, Text) [(Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)] =
foldl'
(insertRecord (\(_, _, _, _, _, a) -> (At.post_id a, At.board_filename a)))
Map.empty
attachments_on_board
let to_insert_map =
Map.filterWithKey
(\k _ -> not $ k `Map.member` map_existing)
map_should_exist
let to_insert = foldr (++) [] $ Map.elems to_insert_map
to_insert_exist <- filterM attachmentFileExists to_insert
with_hashes <- mapM computeAttachmentHash to_insert_exist
attachments_result <- Client.postAttachments settings with_hashes
case attachments_result of
Left err -> do
putStrLn $ "Error posting attachments: " ++ show err
exitFailure
Right saved -> do
putStrLn $ "Saved " ++ (show $ length saved) ++ " attachments!"
mapM_ (copyFiles settings) attachments_on_board
where
attachmentFileExists :: (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment) -> IO Bool
attachmentFileExists (_, _, _, _, p, _) = doesFileExist (At.file_path p)
computeAttachmentHash :: (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment) -> IO At.Attachment
computeAttachmentHash (_, _, _, _, p, q) = do
let f = At.file_path p
putStrLn $ "Reading " ++ f
-- putStrLn $ show p
-- putStrLn $ show (q { At.sha256_hash = "undefined" })
sha256_sum <- Hash.computeSHA256 f
putStrLn $ "SHA-256: " ++ unpack sha256_sum
phash :: Maybe Int64 <-
case (At.mimetype q) `Set.member` phash_mimetypes of
True -> do
either_phash <- fileHash f
case either_phash of
Left err_str -> do
putStrLn $ "Failed to compute phash for file " ++ (unpack sha256_sum) ++ " " ++ f ++ " " ++ err_str
return Nothing
Right phash_w -> do
let phash_i = Words.wordToSignedInt64 phash_w
if phash_i == 0 then do
putStrLn $ "phash is 0 for file " ++ (unpack sha256_sum) ++ " " ++ f
return Nothing
else do
putStrLn $ "phash: " ++ show phash_w
return $ Just $ Words.wordToSignedInt64 phash_w
False -> return Nothing
return q
{ At.sha256_hash = sha256_sum
, At.phash = phash
}
parseLegacyPaths :: JSONPosts.Post -> Maybe (At.Paths, At.Attachment)
parseLegacyPaths post = do
tim <- JSONPosts.tim post
ext <- JSONPosts.ext post
filename <- JSONPosts.filename post
size <- JSONPosts.fsize post
spoiler <- JSONPosts.fsize post
let
board = JSONPosts.board post
file_path = withPathPrefix $ board <> "/src/" <> tim <> ext
thumbnail_path = withPathPrefix $ board <> "/thumb/" <> tim <> ext
p = At.Paths file_path thumbnail_path
mime = getMimeType ext
attachment = At.Attachment
{ At.mimetype = mime
, At.creation_time = undefined
, At.sha256_hash = undefined
, At.phash = Nothing
, At.illegal = False
, At.post_id = undefined
, At.resolution = undefined
, At.file_extension = Just $ T.drop 1 ext
, At.thumb_extension = Just $ "png"
, At.original_filename = Just $ filename <> ext
, At.file_size_bytes = size
, At.board_filename = tim
, At.spoiler = spoiler > 0
, At.attachment_idx = 1
}
return (p, attachment)
notDeleted :: (a, b, c, d, At.Paths, At.Attachment) -> Bool
notDeleted (_, _, _, _, p, _) = not $ "deleted" `isSuffixOf` (At.file_path p)
withPathPrefix :: Text -> FilePath
withPathPrefix = ((<>) $ backup_read_root settings) . unpack
parseAttachments
:: (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)
-> [(Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)]
parseAttachments (site, board, thread, p, q) = filter notDeleted $
case JSONPosts.files p of
Just files -> map
(\(i, x) ->
( site
, board
, thread
, q
, At.Paths (withPathPrefix $ JS.file_path x) (withPathPrefix $ JS.thumb_path x)
, fileToAttachment i q x
)
) (zip [1..] files)
Nothing ->
case parseLegacyPaths p of
Nothing -> []
Just (paths, a) ->
let
dim = (JSONPosts.w p) >>= \w -> ((JSONPosts.h p) >>= \h -> Just $ At.Dimension w h)
in
( site
, board
, thread
, q
, paths
, a
{ At.creation_time = Posts.creation_time q
, At.resolution = dim
, At.post_id = fromJust $ Posts.post_id q
}
) : []
insertRecord
:: Ord a
=> (b -> a)
-> Map.Map a [b]
-> b
-> Map.Map a [b]
insertRecord getKey accMap x =
let pid = getKey x
l = Map.findWithDefault [] pid accMap
in Map.insert pid (x : l) accMap
createNewPosts
:: JSONSettings
-> [ (Threads.Thread, JSONPosts.Post, Client.PostId) ]
-> IO [ Posts.Post ]
createNewPosts settings tuples = do
existing_post_results <- Client.getPosts settings $ map (\(_, _, c) -> c) tuples
existing_posts <- either handleError return existing_post_results
thread_max_local_idx_result <- Client.getThreadMaxLocalIdx settings thread_ids
thread_max_local_idxs <- either handleError return thread_max_local_idx_result
let existing_set :: Set (Int64, Int64) = Set.fromList (map (\x -> (Posts.thread_id x, Posts.board_post_id x)) existing_posts)
let to_insert_list :: [ (Threads.Thread, JSONPosts.Post, Client.PostId) ] =
sortBy (comparing $ \(_, _, p) -> Client.board_post_id p) $
newPosts tuples existing_set
-- Map of thread_id to the largest local_idx value (which would be the number of the last post in the thread)
let local_idx :: Map.Map Int64 Int = Map.fromList thread_max_local_idxs
let insert_posts :: [ Posts.Post ] = fst $ foldl' foldFn ([], local_idx) to_insert_list
-- posts to insert are the posts that are not in existing_posts
-- so we create a Set (thread_id, board_post_id) ✓
-- then check every tuples against the set and the ones not in the set get added to a to_insert_list ✓
-- also for every tuples we need to compute a local_idx
-- so we create a Map index_map from thread_id to local_idx ✓
-- - for existing_posts
-- - need to compare posts already in the map with another post and keep the max local_idx ✓
-- to get the new local_idx, we must order the to_insert_list by board_post_id, and look up each entry ✓
print insert_posts
posts_result <- Client.postPosts settings insert_posts
new_posts <- either handleError return posts_result
return $ existing_posts ++ new_posts
where
handleError err = print err >> exitFailure
thread_ids :: [ Int64 ]
thread_ids = Set.elems $ Set.fromList $ map (\(t, _, _) -> Threads.thread_id t) tuples
newPosts :: [(Threads.Thread, JSONPosts.Post, Client.PostId)] -> Set (Int64, Int64) -> [(Threads.Thread, JSONPosts.Post, Client.PostId)]
newPosts ts existing_set = filter (\(_, _, c) -> Set.notMember (Client.thread_id c, Client.board_post_id c) existing_set) ts
foldFn
:: ([Posts.Post], Map.Map Int64 Int)
-> (Threads.Thread, JSONPosts.Post, Client.PostId)
-> ([Posts.Post], Map.Map Int64 Int)
foldFn (posts, idx_map) (t, p, c) =
case Map.lookup thread_id idx_map of
Nothing -> (post 1 : posts, Map.insert thread_id 1 idx_map)
Just i -> (post (i + 1) : posts, Map.insert thread_id (i + 1) idx_map)
where
post :: Int -> Posts.Post
post i = apiPostToArchivePost i t p
thread_id = Client.thread_id c
processBoard :: JSONSettings -> Sites.Site -> Boards.Board -> IO ()
processBoard settings site board = do
let catalogPath = backupDir </> "catalog.json"
putStrLn $ "catalog file path: " ++ catalogPath
result <- parseJSONCatalog catalogPath
case result of
Right catalogs -> do
let threads_on_board = concatMap ((maybe [] id) . threads) catalogs
all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board
all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board
let tuples :: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)] = concatMap
(\(t, posts) -> map (\p -> (site, board, t, p)) posts)
all_posts_on_board
posts_result :: [ Posts.Post ] <- createNewPosts settings (map (\(_, _, c, d) -> (c, d, apiPostToPostKey c d)) tuples)
putStrLn "Sum of post_ids:"
print $ sum $ map (fromJust . Posts.post_id) posts_result
putStrLn "Sum of board_post_ids:"
print $ sum $ map Posts.board_post_id posts_result
let perfect_post_pairs = addPostsToTuples tuples posts_result
processFiles settings perfect_post_pairs
Left errMsg ->
putStrLn $ "Failed to parse the JSON file in directory: "
++ (Boards.pathpart board) ++ ". Error: " ++ errMsg
where
backupDir :: FilePath
backupDir = backup_read_root settings </> (Boards.pathpart board)
processBackupDirectory :: JSONSettings -> IO ()
processBackupDirectory settings = do
putStrLn "JSON successfully read!"
print settings -- print the decoded JSON settings
site :: Sites.Site <- ensureSiteExists settings
dirs <- listCatalogDirectories settings
let dirsSet = Set.fromList dirs
let site_id_ = Sites.site_id site
boards_result <- Client.getSiteBoards settings site_id_
putStrLn "Boards fetched!"
case boards_result of
Left err -> do
putStrLn $ "Error fetching boards: " ++ show err
exitFailure
Right archived_boards -> do
let boardnames = map Boards.pathpart archived_boards
created_boards <- createArchivesForNewBoards settings dirsSet boardnames site_id_
let boards :: [ Boards.Board ] = archived_boards ++ created_boards
let boards_we_have_data_for = filter (\board -> Set.member (Boards.pathpart board) dirsSet) boards
mapM_ (processBoard settings site) boards_we_have_data_for
-- TODO: detect saged threads by reading the bump time from the thread and comparing -- TODO: detect saged threads by reading the bump time from the thread and comparing
-- that time to the timestamp of the most recent post. If the post is newer -- that time to the timestamp of the most recent post. If the post is newer

@ -1 +1 @@
Subproject commit d47fbe70c6c40ad6963411d6ffbbadc1839b5b7c Subproject commit 202f0eb9616b6675e3fa011c69d8fda9028e5e59

651
src/Lib.hs Normal file
View File

@ -0,0 +1,651 @@
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Redundant bracket" #-}
{-# HLINT ignore "Use fromMaybe" #-}
{-# LANGUAGE OverloadedStrings #-}
module Lib where
import System.Exit
import Data.Int (Int64)
import Control.Monad (filterM)
import System.Console.CmdArgs
import System.Directory
( listDirectory
, doesFileExist
, copyFile
, createDirectoryIfMissing
)
import System.FilePath ((</>), (<.>), takeExtension)
import Data.List (find, isSuffixOf, foldl', sortBy)
import Data.Ord (comparing)
import qualified Data.Set as Set
import Data.Set (Set)
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Data.Time.Clock (UTCTime)
import qualified Data.Map as Map
import Data.Maybe (fromJust)
import Data.Text (Text, unpack)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8)
import Network.Mime (defaultMimeLookup)
import PerceptualHash (fileHash)
import JSONParsing
import Common.Server.JSONSettings
import qualified JSONCommonTypes as JS
import qualified JSONPost as JSONPosts
import qualified Network.DataClient as Client
import qualified SitesType as Sites
import qualified BoardsType as Boards
import qualified ThreadType as Threads
import qualified Common.AttachmentType as At
import qualified Common.PostsType as Posts
import qualified Hash as Hash
import qualified Data.WordUtil as Words
newtype SettingsCLI = SettingsCLI
{ jsonFile :: FilePath
} deriving (Show, Data, Typeable)
listCatalogDirectories :: JSONSettings -> IO [ FilePath ]
listCatalogDirectories settings = do
allDirs <- listDirectory (backup_read_root settings)
let filteredDirs = filter (`notElem` excludedDirs) allDirs
filterM hasCatalog filteredDirs
where
excludedDirs = ["sfw", "alt", "overboard"]
hasCatalog dir = do
let catalogPath = backup_read_root settings </> dir </> "catalog.json"
doesFileExist catalogPath
ensureSiteExists :: JSONSettings -> IO Sites.Site
ensureSiteExists settings = do
sitesResult <- Client.getAllSites settings
case sitesResult of
Right siteList ->
case find (\site -> Sites.name site == site_name settings) siteList of
Just site -> do
putStrLn $ site_name settings ++ " already exists!"
return site
Nothing -> do
putStrLn $ site_name settings ++ " does not exist. Creating..."
postResult <- Client.postSite settings
case postResult of
Right (site:_) -> do
putStrLn $ "Successfully created " ++ site_name settings ++ ". " ++ show site
return site
Right [] -> do
putStrLn "Did not get new site id back from postgrest"
exitFailure
Left err -> do
putStrLn $ "Failed to create " ++ site_name settings
++ " Error: " ++ show err
exitFailure
Left err -> do
putStrLn $ "Error fetching sites: " ++ show err
exitFailure
createArchivesForNewBoards
:: JSONSettings
-> Set String
-> [ String ]
-> Int
-> IO [ Boards.Board ]
createArchivesForNewBoards settings dirsSet archived_boards siteid = do
let archivedBoardsSet = Set.fromList archived_boards
-- Find boards that are in dirs but not in archived_boards
let boardsToArchive = dirsSet `Set.difference` archivedBoardsSet
putStrLn "Creating boards:"
mapM_ putStrLn boardsToArchive
post_result <- Client.postBoards settings (Set.toList boardsToArchive) siteid
case post_result of
Left err -> do
putStrLn $ "Error posting boards: " ++ show err
exitFailure
Right boards -> do
putStrLn "Created the following boards:"
mapM_ (putStrLn . Boards.pathpart) boards
return boards
apiThreadToArchiveThread :: Int -> Thread -> Threads.Thread
apiThreadToArchiveThread board_id_ json_thread =
Threads.Thread
{ Threads.thread_id = undefined
, Threads.board_thread_id = no json_thread
, Threads.creation_time = epochToUTCTime $ fromIntegral (time json_thread)
, Threads.board_id = board_id_
}
epochToUTCTime :: Int -> UTCTime
epochToUTCTime = posixSecondsToUTCTime . realToFrac
createArchivesForNewThreads
:: JSONSettings
-> [ Thread ]
-> [ Threads.Thread ]
-> Boards.Board
-> IO [ Threads.Thread ]
createArchivesForNewThreads settings all_threads archived_threads board = do
putStrLn $ "Creating " ++ show (length threads_to_create) ++ " threads."
threads_result <- Client.postThreads settings (map (apiThreadToArchiveThread board_id) threads_to_create)
case threads_result of
Left err -> do
putStrLn $ "Error creating threads: " ++ show err
exitFailure
Right new_threads -> return new_threads
where
board_id :: Int = Boards.board_id board
archived_board_thread_ids :: Set.Set Int
archived_board_thread_ids =
Set.fromList $ map Threads.board_thread_id archived_threads
threads_to_create :: [ Thread ]
threads_to_create =
filter
((`Set.notMember` archived_board_thread_ids) . no)
all_threads
ensureThreads :: JSONSettings -> Boards.Board -> [ Thread ] -> IO [ Threads.Thread ]
ensureThreads settings board all_threads = do
threads_result <- Client.getThreads settings (Boards.board_id board) (map no all_threads)
case threads_result of
Left err -> do
putStrLn $ "Error fetching threads: " ++ show err
exitFailure
Right archived_threads -> do
putStrLn $ show (length archived_threads) ++ " threads already exist."
new_threads <- createArchivesForNewThreads settings all_threads archived_threads board
return $ archived_threads ++ new_threads
readPosts
:: JSONSettings
-> Boards.Board
-> Threads.Thread
-> IO (Threads.Thread, [ JSONPosts.Post ])
readPosts settings board thread = do
result <- parsePosts thread_filename
case result of
Left err -> do
putStrLn $ "Failed to parse the JSON file " ++ thread_filename ++ " error: " ++ err
return (thread, [])
Right posts_wrapper -> return (thread, JSONPosts.posts posts_wrapper)
where
thread_filename :: FilePath
thread_filename = backupDir </> "res" </> (show (Threads.board_thread_id thread) ++ ".json")
backupDir :: FilePath
backupDir = backup_read_root settings </> Boards.pathpart board
apiPostToPostKey :: Threads.Thread -> JSONPosts.Post -> Client.PostId
apiPostToPostKey thread post =
Client.PostId
{ Client.thread_id = (Threads.thread_id thread)
, Client.board_post_id = (JSONPosts.no post)
}
-- Convert Post to DbPost
apiPostToArchivePost :: Int -> Threads.Thread -> JSONPosts.Post -> Posts.Post
apiPostToArchivePost local_idx thread post =
Posts.Post
{ Posts.post_id = Nothing
, Posts.board_post_id = JSONPosts.no post
, Posts.creation_time = posixSecondsToUTCTime (realToFrac $ JSONPosts.time post)
, Posts.body = JSONPosts.com post
, Posts.name = JSONPosts.name post
, Posts.subject = JSONPosts.sub post
, Posts.email = JSONPosts.email post
, Posts.thread_id = Threads.thread_id thread
, Posts.embed = JSONPosts.embed post
, Posts.local_idx = local_idx
}
-- | A version of 'concatMap' that works with a monadic predicate.
-- Stolen from package extra Control.Monad.Extra
concatMapM :: Monad m => (a -> m [b]) -> [a] -> m [b]
{-# INLINE concatMapM #-}
concatMapM op = foldr f (pure [])
where f x xs = do
x_ <- op x
if null x_
then xs
else do
xs_ <- xs
pure $ x_ ++ xs_
addPostsToTuples
:: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)]
-> [ Posts.Post ]
-> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)]
addPostsToTuples tuples posts = map f posts
where
post_map :: Map.Map (Int64, Int64) (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)
post_map = Map.fromList (map (\(a, b, c, d) -> ((Threads.thread_id c, JSONPosts.no d), (a, b, c, d))) tuples)
f :: Posts.Post -> (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)
f new_post =
(\(a, b, c, d) -> (a, b, c, d, new_post))
(post_map Map.! (Posts.thread_id new_post, Posts.board_post_id new_post))
fileToAttachment :: Int -> Posts.Post -> JS.File -> At.Attachment
fileToAttachment i post file =
At.Attachment
{ At.mimetype = maybe guessed_mime id (JS.mime file)
, At.creation_time = Posts.creation_time post
, At.sha256_hash = undefined
, At.phash = Nothing
, At.illegal = False
, At.post_id = fromJust $ Posts.post_id post
, At.resolution = dim
, At.file_extension = Just extension
, At.thumb_extension = Just thumb_extension
, At.original_filename = Just $ JS.filename file <> "." <> extension
, At.file_size_bytes = JS.fsize file
, At.board_filename = JS.id file
, At.spoiler = maybe False id $ JS.spoiler file
, At.attachment_idx = i
}
where
extension = JS.ext file
thumb_extension = T.pack $ drop 1 $ takeExtension $ unpack $ JS.thumb_path file
guessed_mime = getMimeType extension
dim = (JS.w file) >>= \w ->
((JS.h file) >>= \h ->
Just $ At.Dimension w h)
getMimeType :: Text -> Text
getMimeType ext = decodeUtf8 $ defaultMimeLookup ext
phash_mimetypes :: Set.Set Text
phash_mimetypes = Set.fromList
[ "image/jpeg"
, "image/png"
, "image/gif"
]
copyFiles :: JSONSettings -> (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment) -> IO ()
copyFiles settings (site, board, thread, _, path, attachment) = do
destination_exists <- doesFileExist dest
if not destination_exists
then do
src_exists <- doesFileExist src
createDirectoryIfMissing True common_dest
if src_exists
then putStrLn ("Copying " ++ src) >> copyFile src dest
else return ()
thumb_exists <- doesFileExist thumb_src
if thumb_exists
then putStrLn ("Copying " ++ thumb_src) >> copyFile thumb_src thumb_dest
else return ()
else return ()
-- src = (At.file_path | At.thumb_path)
-- dest = <media_root>/<website_name>/<boardpart>/<board_thread_id>/<sha>.<ext>
where
src :: FilePath
src = At.file_path path
thumb_src :: FilePath
thumb_src = At.thumbnail_path path
dest :: FilePath
dest = common_dest
</> (unpack $ At.board_filename attachment)
<.> (unpack $ fromJust $ At.file_extension attachment)
thumb_dest :: FilePath
thumb_dest = common_dest
</> "thumbnail_" <> (unpack $ At.board_filename attachment)
<.> (unpack $ fromJust $ At.thumb_extension attachment)
common_dest :: FilePath
common_dest
= (media_root_path settings)
</> Sites.name site
</> Boards.pathpart board
</> (show $ Threads.board_thread_id thread)
processFiles :: JSONSettings -> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)] -> IO ()
processFiles settings tuples = do -- perfect just means that our posts have ids, they're already inserted into the db
let ps = map (\(_, _, _, _, x) -> x) tuples
existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id) ps)
case existing_attachments_result of
Left err -> do
putStrLn $ "Error fetching attachments: " ++ show err
exitFailure
Right existing_attachments -> do
let map_existing :: Map.Map (Int64, Text) [ At.Attachment ] =
foldl'
(insertRecord (\a -> (At.post_id a, At.board_filename a)))
Map.empty
existing_attachments
let attachments_on_board :: [(Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)] =
concatMap parseAttachments tuples
-- attachments_on_board are the only files that can be copied into the archive dir right now
-- since that's where we have the src filename. except here the Attachment doesn't have a sha hash yet
-- so we can't build the destination filename.
let map_should_exist :: Map.Map (Int64, Text) [(Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)] =
foldl'
(insertRecord (\(_, _, _, _, _, a) -> (At.post_id a, At.board_filename a)))
Map.empty
attachments_on_board
let to_insert_map =
Map.filterWithKey
(\k _ -> not $ k `Map.member` map_existing)
map_should_exist
let to_insert = foldr (++) [] $ Map.elems to_insert_map
to_insert_exist <- filterM attachmentFileExists to_insert
with_hashes <- mapM computeAttachmentHash to_insert_exist
attachments_result <- Client.postAttachments settings with_hashes
case attachments_result of
Left err -> do
putStrLn $ "Error posting attachments: " ++ show err
exitFailure
Right saved -> do
putStrLn $ "Saved " ++ (show $ length saved) ++ " attachments!"
mapM_ (copyFiles settings) attachments_on_board
where
attachmentFileExists :: (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment) -> IO Bool
attachmentFileExists (_, _, _, _, p, _) = doesFileExist (At.file_path p)
computeAttachmentHash :: (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment) -> IO At.Attachment
computeAttachmentHash (_, _, _, _, p, q) = do
let f = At.file_path p
putStrLn $ "Reading " ++ f
-- putStrLn $ show p
-- putStrLn $ show (q { At.sha256_hash = "undefined" })
sha256_sum <- Hash.computeSHA256 f
putStrLn $ "SHA-256: " ++ unpack sha256_sum
phash :: Maybe Int64 <-
case (At.mimetype q) `Set.member` phash_mimetypes of
True -> do
either_phash <- fileHash f
case either_phash of
Left err_str -> do
putStrLn $ "Failed to compute phash for file " ++ (unpack sha256_sum) ++ " " ++ f ++ " " ++ err_str
return Nothing
Right phash_w -> do
let phash_i = Words.wordToSignedInt64 phash_w
if phash_i == 0 then do
putStrLn $ "phash is 0 for file " ++ (unpack sha256_sum) ++ " " ++ f
return Nothing
else do
putStrLn $ "phash: " ++ show phash_w
return $ Just $ Words.wordToSignedInt64 phash_w
False -> return Nothing
return q
{ At.sha256_hash = sha256_sum
, At.phash = phash
}
parseLegacyPaths :: JSONPosts.Post -> Maybe (At.Paths, At.Attachment)
parseLegacyPaths post = do
tim <- JSONPosts.tim post
ext <- JSONPosts.ext post
filename <- JSONPosts.filename post
size <- JSONPosts.fsize post
spoiler <- JSONPosts.fsize post
let
board = JSONPosts.board post
file_path = withPathPrefix $ board <> "/src/" <> tim <> ext
thumbnail_path = withPathPrefix $ board <> "/thumb/" <> tim <> ext
p = At.Paths file_path thumbnail_path
mime = getMimeType ext
attachment = At.Attachment
{ At.mimetype = mime
, At.creation_time = undefined
, At.sha256_hash = undefined
, At.phash = Nothing
, At.illegal = False
, At.post_id = undefined
, At.resolution = undefined
, At.file_extension = Just $ T.drop 1 ext
, At.thumb_extension = Just $ "png"
, At.original_filename = Just $ filename <> ext
, At.file_size_bytes = size
, At.board_filename = tim
, At.spoiler = spoiler > 0
, At.attachment_idx = 1
}
return (p, attachment)
notDeleted :: (a, b, c, d, At.Paths, At.Attachment) -> Bool
notDeleted (_, _, _, _, p, _) = not $ "deleted" `isSuffixOf` (At.file_path p)
withPathPrefix :: Text -> FilePath
withPathPrefix = ((<>) $ backup_read_root settings) . unpack
parseAttachments
:: (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)
-> [(Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)]
parseAttachments (site, board, thread, p, q) = filter notDeleted $
case JSONPosts.files p of
Just files -> map
(\(i, x) ->
( site
, board
, thread
, q
, At.Paths (withPathPrefix $ JS.file_path x) (withPathPrefix $ JS.thumb_path x)
, fileToAttachment i q x
)
) (zip [1..] files)
Nothing ->
case parseLegacyPaths p of
Nothing -> []
Just (paths, a) ->
let
dim = (JSONPosts.w p) >>= \w -> ((JSONPosts.h p) >>= \h -> Just $ At.Dimension w h)
in
( site
, board
, thread
, q
, paths
, a
{ At.creation_time = Posts.creation_time q
, At.resolution = dim
, At.post_id = fromJust $ Posts.post_id q
}
) : []
insertRecord
:: Ord a
=> (b -> a)
-> Map.Map a [b]
-> b
-> Map.Map a [b]
insertRecord getKey accMap x =
let pid = getKey x
l = Map.findWithDefault [] pid accMap
in Map.insert pid (x : l) accMap
createNewPosts
:: JSONSettings
-> [ (Threads.Thread, JSONPosts.Post, Client.PostId) ]
-> IO [ Posts.Post ]
createNewPosts settings tuples = do
existing_post_results <- Client.getPosts settings $ map (\(_, _, c) -> c) tuples
existing_posts <- either handleError return existing_post_results
thread_max_local_idx_result <- Client.getThreadMaxLocalIdx settings thread_ids
thread_max_local_idxs <- either handleError return thread_max_local_idx_result
let existing_set :: Set (Int64, Int64) = Set.fromList (map (\x -> (Posts.thread_id x, Posts.board_post_id x)) existing_posts)
let to_insert_list :: [ (Threads.Thread, JSONPosts.Post, Client.PostId) ] =
sortBy (comparing $ \(_, _, p) -> Client.board_post_id p) $
newPosts tuples existing_set
-- Map of thread_id to the largest local_idx value (which would be the number of the last post in the thread)
let local_idx :: Map.Map Int64 Int = Map.fromList thread_max_local_idxs
let insert_posts :: [ Posts.Post ] = fst $ foldl' foldFn ([], local_idx) to_insert_list
-- posts to insert are the posts that are not in existing_posts
-- so we create a Set (thread_id, board_post_id) ✓
-- then check every tuples against the set and the ones not in the set get added to a to_insert_list ✓
-- also for every tuples we need to compute a local_idx
-- so we create a Map index_map from thread_id to local_idx ✓
-- - for existing_posts
-- - need to compare posts already in the map with another post and keep the max local_idx ✓
-- to get the new local_idx, we must order the to_insert_list by board_post_id, and look up each entry ✓
print insert_posts
posts_result <- Client.postPosts settings insert_posts
new_posts <- either handleError return posts_result
return $ existing_posts ++ new_posts
where
handleError err = print err >> exitFailure
thread_ids :: [ Int64 ]
thread_ids = Set.elems $ Set.fromList $ map (\(t, _, _) -> Threads.thread_id t) tuples
newPosts :: [(Threads.Thread, JSONPosts.Post, Client.PostId)] -> Set (Int64, Int64) -> [(Threads.Thread, JSONPosts.Post, Client.PostId)]
newPosts ts existing_set = filter (\(_, _, c) -> Set.notMember (Client.thread_id c, Client.board_post_id c) existing_set) ts
foldFn
:: ([Posts.Post], Map.Map Int64 Int)
-> (Threads.Thread, JSONPosts.Post, Client.PostId)
-> ([Posts.Post], Map.Map Int64 Int)
foldFn (posts, idx_map) (t, p, c) =
case Map.lookup thread_id idx_map of
Nothing -> (post 1 : posts, Map.insert thread_id 1 idx_map)
Just i -> (post (i + 1) : posts, Map.insert thread_id (i + 1) idx_map)
where
post :: Int -> Posts.Post
post i = apiPostToArchivePost i t p
thread_id = Client.thread_id c
processBoard :: JSONSettings -> Sites.Site -> Boards.Board -> IO ()
processBoard settings site board = do
let catalogPath = backupDir </> "catalog.json"
putStrLn $ "catalog file path: " ++ catalogPath
result <- parseJSONCatalog catalogPath
case result of
Right catalogs -> do
let threads_on_board = concatMap ((maybe [] id) . threads) catalogs
all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board
all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board
let tuples :: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)] = concatMap
(\(t, posts) -> map (\p -> (site, board, t, p)) posts)
all_posts_on_board
posts_result :: [ Posts.Post ] <- createNewPosts settings (map (\(_, _, c, d) -> (c, d, apiPostToPostKey c d)) tuples)
putStrLn "Sum of post_ids:"
print $ sum $ map (fromJust . Posts.post_id) posts_result
putStrLn "Sum of board_post_ids:"
print $ sum $ map Posts.board_post_id posts_result
let perfect_post_pairs = addPostsToTuples tuples posts_result
processFiles settings perfect_post_pairs
Left errMsg ->
putStrLn $ "Failed to parse the JSON file in directory: "
++ (Boards.pathpart board) ++ ". Error: " ++ errMsg
where
backupDir :: FilePath
backupDir = backup_read_root settings </> (Boards.pathpart board)
processBackupDirectory :: JSONSettings -> IO ()
processBackupDirectory settings = do
putStrLn "JSON successfully read!"
print settings -- print the decoded JSON settings
site :: Sites.Site <- ensureSiteExists settings
dirs <- listCatalogDirectories settings
let dirsSet = Set.fromList dirs
let site_id_ = Sites.site_id site
boards_result <- Client.getSiteBoards settings site_id_
putStrLn "Boards fetched!"
case boards_result of
Left err -> do
putStrLn $ "Error fetching boards: " ++ show err
exitFailure
Right archived_boards -> do
let boardnames = map Boards.pathpart archived_boards
created_boards <- createArchivesForNewBoards settings dirsSet boardnames site_id_
let boards :: [ Boards.Board ] = archived_boards ++ created_boards
let boards_we_have_data_for = filter (\board -> Set.member (Boards.pathpart board) dirsSet) boards
mapM_ (processBoard settings site) boards_we_have_data_for

View File

@ -1,4 +1,6 @@
module Main where {-# LANGUAGE RecordWildCards #-}
module Main (main) where
import System.Exit (exitFailure) import System.Exit (exitFailure)
import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy as B
@ -6,11 +8,26 @@ import System.Console.CmdArgs (cmdArgs, Data, Typeable)
import Data.Aeson (decode) import Data.Aeson (decode)
import Common.Server.ConsumerSettings import Common.Server.ConsumerSettings
import Common.Server.JSONSettings as J
import Lib
( ensureSiteExists
)
newtype CliArgs = CliArgs newtype CliArgs = CliArgs
{ settingsFile :: String { settingsFile :: String
} deriving (Show, Data, Typeable) } deriving (Show, Data, Typeable)
toClientSettings :: ConsumerJSONSettings -> JSONSiteSettings -> J.JSONSettings
toClientSettings ConsumerJSONSettings {..} JSONSiteSettings {..} =
J.JSONSettings
{ J.postgrest_url = postgrest_url
, J.jwt = jwt
, J.backup_read_root = undefined
, J.media_root_path = media_root_path
, J.site_name = name
, J.site_url = root_url
}
getSettings :: IO ConsumerJSONSettings getSettings :: IO ConsumerJSONSettings
getSettings = do getSettings = do
cliArgs <- cmdArgs $ CliArgs "consumer_settings.json" cliArgs <- cmdArgs $ CliArgs "consumer_settings.json"
@ -30,9 +47,17 @@ getSettings = do
Just settings -> return settings Just settings -> return settings
processWebsite :: ConsumerJSONSettings -> JSONSiteSettings -> IO ()
processWebsite settings site_settings = do
let client_settings = toClientSettings settings site_settings
site <- ensureSiteExists client_settings
return ()
main :: IO () main :: IO ()
main = do main = do
putStrLn "Hello World" putStrLn "Starting channel web synchronization."
settings <- getSettings settings <- getSettings
print settings print settings
mapM_ (processWebsite settings) (websites settings)