Compare commits
19 Commits
|
@ -2,7 +2,7 @@
|
|||
"postgrest_url": "http://localhost:3000",
|
||||
"jwt": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJyb2xlIjoiY2hhbl9hcmNoaXZlciJ9.rGIKZokTDKTuQLIv8138bUby5PELfDipYYIDpJzH02c",
|
||||
"backup_read_root": "/home/phil/linixy/tmp/leftypol_back/lainchan.leftypol.org",
|
||||
"media_root_path": "/home/phil/linixy/tmp/chan_archive_media",
|
||||
"media_root_path": "/home/phil/linixy/tmp/chan_archive_media2/archive",
|
||||
"site_name": "leftychan",
|
||||
"site_url": "https://leftychan.net"
|
||||
}
|
||||
|
|
|
@ -66,6 +66,7 @@ executable chan-delorean
|
|||
|
||||
-- Modules included in this executable, other than Main.
|
||||
other-modules:
|
||||
Lib
|
||||
JSONParsing
|
||||
SitesType
|
||||
BoardsType
|
||||
|
@ -79,7 +80,9 @@ executable chan-delorean
|
|||
Data.WordUtil
|
||||
Network.DataClient
|
||||
Network.DataClientTypes
|
||||
Network.GetLatestPostsPerBoardResponse
|
||||
Common.Server.JSONSettings
|
||||
Common.Server.ConsumerSettings
|
||||
|
||||
-- LANGUAGE extensions used by modules in this package.
|
||||
-- other-extensions:
|
||||
|
@ -101,7 +104,71 @@ executable chan-delorean
|
|||
cryptonite,
|
||||
memory,
|
||||
mime-types,
|
||||
perceptual-hash
|
||||
perceptual-hash,
|
||||
temporary
|
||||
|
||||
-- Directories containing source files.
|
||||
hs-source-dirs: src
|
||||
|
||||
-- Base language which the package is written in.
|
||||
default-language: GHC2021
|
||||
|
||||
executable chan-delorean-consoomer
|
||||
-- Import common warning flags.
|
||||
import: warnings
|
||||
|
||||
-- .hs or .lhs file containing the Main module.
|
||||
main-is: Main.hs
|
||||
|
||||
-- Modules included in this executable, other than Main.
|
||||
other-modules:
|
||||
Lib
|
||||
JSONParsing
|
||||
SitesType
|
||||
BoardsType
|
||||
ThreadType
|
||||
JSONPost
|
||||
JSONCommonTypes
|
||||
Common.PostsType
|
||||
Common.AttachmentType
|
||||
Common.Network.HttpClient
|
||||
Hash
|
||||
Data.WordUtil
|
||||
Network.DataClient
|
||||
Network.DataClientTypes
|
||||
Network.GetLatestPostsPerBoardResponse
|
||||
Common.Server.ConsumerSettings
|
||||
Common.Server.JSONSettings
|
||||
PriorityQueue
|
||||
Sync
|
||||
BoardQueueElem
|
||||
|
||||
-- LANGUAGE extensions used by modules in this package.
|
||||
-- other-extensions:
|
||||
|
||||
-- Other library packages from which modules are imported.
|
||||
build-depends: base,
|
||||
aeson,
|
||||
bytestring,
|
||||
cmdargs,
|
||||
directory,
|
||||
filepath,
|
||||
containers,
|
||||
text,
|
||||
http-client,
|
||||
http-conduit,
|
||||
safe-exceptions,
|
||||
http-types,
|
||||
time,
|
||||
cryptonite,
|
||||
memory,
|
||||
mime-types,
|
||||
perceptual-hash,
|
||||
async,
|
||||
temporary,
|
||||
stm,
|
||||
random
|
||||
-- transformers
|
||||
|
||||
-- Directories containing source files.
|
||||
hs-source-dirs: src
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
{
|
||||
"websites": [
|
||||
{
|
||||
"name": "example",
|
||||
"root_url": "https://example.net",
|
||||
"boards": [
|
||||
"tech",
|
||||
"meta"
|
||||
]
|
||||
}
|
||||
],
|
||||
"postgrest_url": "http://localhost:3000",
|
||||
"jwt": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJyb2xlIjoiY2hhbl9hcmNoaXZlciJ9.rGIKZokTDKTuQLIv8138bUby5PELfDipYYIDpJzH02c",
|
||||
"media_root_path": "/home/phil/linixy/tmp/chan_archive_media_repaired/archive",
|
||||
"http_fill_all": false,
|
||||
"http_sync_continously": true
|
||||
}
|
|
@ -222,3 +222,99 @@ SELECT * FROM boards;
|
|||
|
||||
SELECT * FROM threads WHERE thread_id = 11314;
|
||||
ANALYZE posts;
|
||||
|
||||
SELECT count(*) from attachments;
|
||||
|
||||
SELECT * FROM attachments WHERE post_id = 253383;
|
||||
SELECT * from attachments WHERE board_filename = '1722466065515';
|
||||
SELECT count(*) attachments WHERE attachment_id < (SELECT attachment_id FROM attachments WHERE board_filename = '1722466065515');
|
||||
SELECT max(attachment_id) FROM attachments a;
|
||||
SELECT pg_get_serial_sequence('attachments', 'attachment_id');
|
||||
SELECT setval(pg_get_serial_sequence('attachments', 'attachment_id'), COALESCE(198853, 1), true);
|
||||
|
||||
|
||||
UPDATE attachments SET thumb_extension = 'png'
|
||||
WHERE
|
||||
attachment_id IN
|
||||
(
|
||||
SELECT a.attachment_id
|
||||
FROM attachments a
|
||||
JOIN posts p ON a.post_id = p.post_id
|
||||
JOIN threads t ON p.thread_id = t.thread_id
|
||||
JOIN boards b ON t.board_id = b.board_id
|
||||
JOIN sites s ON b.site_id = s.site_id
|
||||
WHERE s.name = 'leftychan'
|
||||
AND a.thumb_extension = 'jpg'
|
||||
);
|
||||
|
||||
|
||||
SELECT * FROM posts WHERE board_post_id = 129;
|
||||
SELECT * FROM attachments WHERE post_id = 461287;
|
||||
|
||||
SELECT count(a.*)
|
||||
FROM attachments a
|
||||
JOIN posts p ON a.post_id = p.post_id
|
||||
JOIN threads t ON p.thread_id = t.thread_id
|
||||
JOIN boards b ON t.board_id = b.board_id
|
||||
JOIN sites s ON b.site_id = s.site_id
|
||||
WHERE s.name = 'leftychan'
|
||||
AND a.thumb_extension = 'jpg';
|
||||
|
||||
|
||||
SELECT * FROM posts
|
||||
JOIN threads ON threads.thread_id = posts.thread_id
|
||||
JOIN boards ON boards.board_id = threads.board_id
|
||||
WHERE boards.pathpart = 'leftypol'
|
||||
AND boards.site_id = 1
|
||||
ORDER BY posts.creation_time DESC
|
||||
LIMIT 1;
|
||||
|
||||
SELECT * FROM posts
|
||||
ORDER BY posts.creation_time DESC
|
||||
LIMIT 1;
|
||||
|
||||
SELECT boards.board_id, boards.pathpart, sites.name FROM boards JOIN sites ON sites.site_id = boards.site_id;
|
||||
|
||||
SELECT DISTINCT ON (b.board_id)
|
||||
b.board_id,
|
||||
b.site_id,
|
||||
b.pathpart,
|
||||
p.post_id,
|
||||
p.board_post_id,
|
||||
p.creation_time,
|
||||
p.body,
|
||||
t.thread_id,
|
||||
t.board_thread_id
|
||||
FROM boards b
|
||||
JOIN threads t ON t.board_id = b.board_id
|
||||
JOIN posts p ON p.thread_id = t.thread_id
|
||||
ORDER BY b.board_id, p.creation_time DESC;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION get_latest_posts_per_board()
|
||||
RETURNS TABLE (
|
||||
board_id int,
|
||||
site_id int,
|
||||
pathpart text,
|
||||
post_id bigint,
|
||||
board_post_id bigint,
|
||||
creation_time timestamp with time zone,
|
||||
thread_id bigint,
|
||||
board_thread_id bigint
|
||||
) AS $$
|
||||
SELECT DISTINCT ON (b.board_id)
|
||||
b.board_id,
|
||||
b.site_id,
|
||||
b.pathpart,
|
||||
p.post_id,
|
||||
p.board_post_id,
|
||||
p.creation_time,
|
||||
t.thread_id,
|
||||
t.board_thread_id
|
||||
FROM boards b
|
||||
JOIN threads t ON t.board_id = b.board_id
|
||||
JOIN posts p ON p.thread_id = t.thread_id
|
||||
ORDER BY b.board_id, p.creation_time DESC;
|
||||
$$ LANGUAGE sql STABLE;
|
||||
|
||||
SELECT * FROM get_latest_posts_per_board();
|
||||
|
|
|
@ -21,6 +21,7 @@ DROP TYPE IF EXISTS post_key CASCADE;
|
|||
DROP FUNCTION IF EXISTS update_post_body_search_index;
|
||||
DROP FUNCTION IF EXISTS fetch_top_threads;
|
||||
DROP FUNCTION IF EXISTS fetch_catalog;
|
||||
DROP FUNCTION IF EXISTS get_latest_posts_per_board;
|
||||
|
||||
|
||||
-- It won't let us drop roles otherwise and the IFs are to keep this script idempotent.
|
||||
|
@ -222,7 +223,7 @@ CREATE OR REPLACE FUNCTION fetch_top_threads(
|
|||
lookback INT DEFAULT 10000
|
||||
)
|
||||
RETURNS TABLE(bump_time TIMESTAMPTZ, post_count BIGINT, thread_id BIGINT, where_to_leave_off TIMESTAMPTZ)
|
||||
LANGUAGE sql
|
||||
LANGUAGE sql STABLE
|
||||
AS $$
|
||||
SELECT
|
||||
max(creation_time) as bump_time,
|
||||
|
@ -266,6 +267,51 @@ CREATE TYPE catalog_grid_result AS
|
|||
);
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION fetch_catalog(max_time timestamptz, max_row_read int DEFAULT 10000)
|
||||
RETURNS SETOF catalog_grid_result AS $$
|
||||
WITH
|
||||
top AS
|
||||
(
|
||||
SELECT * FROM fetch_top_threads(max_time, max_row_read) AS top
|
||||
),
|
||||
tall_posts AS
|
||||
(
|
||||
SELECT
|
||||
top.post_count AS estimated_post_count,
|
||||
posts.post_id,
|
||||
posts.board_post_id,
|
||||
posts.creation_time,
|
||||
top.bump_time,
|
||||
posts.body,
|
||||
posts.subject,
|
||||
posts.thread_id,
|
||||
posts.embed
|
||||
FROM top
|
||||
JOIN posts ON top.thread_id = posts.thread_id AND posts.local_idx = 1
|
||||
WHERE creation_time < max_time
|
||||
)
|
||||
SELECT
|
||||
-- post_counts.post_count,
|
||||
tall_posts.*,
|
||||
threads.board_thread_id, -- this should be part of the url path when creating links, not thread_id (that's internal)
|
||||
boards.pathpart,
|
||||
sites."name",
|
||||
-- sites.site_id,
|
||||
attachments.mimetype AS file_mimetype,
|
||||
attachments.illegal AS file_illegal,
|
||||
-- attachments.resolution AS file_resolution,
|
||||
attachments.board_filename AS file_name,
|
||||
attachments.file_extension,
|
||||
attachments.thumb_extension AS file_thumb_extension
|
||||
FROM tall_posts
|
||||
JOIN threads ON tall_posts.thread_id = threads.thread_id
|
||||
JOIN boards ON threads.board_id = boards.board_id
|
||||
JOIN sites ON sites.site_id = boards.site_id
|
||||
LEFT OUTER JOIN attachments ON attachments.post_id = tall_posts.post_id AND attachments.attachment_idx = 1
|
||||
ORDER BY bump_time DESC;
|
||||
$$ LANGUAGE sql STABLE;
|
||||
|
||||
|
||||
-- Function: search_posts
|
||||
--
|
||||
-- This function performs a full-text search on the `posts` table using PostgreSQL's text search features.
|
||||
|
@ -355,6 +401,33 @@ RETURNS SETOF catalog_grid_result AS $$
|
|||
$$ LANGUAGE sql STABLE;
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION get_latest_posts_per_board()
|
||||
RETURNS TABLE (
|
||||
board_id int,
|
||||
site_id int,
|
||||
pathpart text,
|
||||
post_id bigint,
|
||||
board_post_id bigint,
|
||||
creation_time timestamp with time zone,
|
||||
thread_id bigint,
|
||||
board_thread_id bigint
|
||||
) AS $$
|
||||
SELECT DISTINCT ON (b.board_id)
|
||||
b.board_id,
|
||||
b.site_id,
|
||||
b.pathpart,
|
||||
p.post_id,
|
||||
p.board_post_id,
|
||||
p.creation_time,
|
||||
t.thread_id,
|
||||
t.board_thread_id
|
||||
FROM boards b
|
||||
JOIN threads t ON t.board_id = b.board_id
|
||||
JOIN posts p ON p.thread_id = t.thread_id
|
||||
ORDER BY b.board_id, p.creation_time DESC;
|
||||
$$ LANGUAGE sql STABLE;
|
||||
|
||||
|
||||
/*
|
||||
* Permissions
|
||||
*/
|
||||
|
@ -364,18 +437,20 @@ REVOKE EXECUTE ON FUNCTION fetch_catalog FROM PUBLIC;
|
|||
REVOKE EXECUTE ON FUNCTION search_posts FROM PUBLIC;
|
||||
REVOKE EXECUTE ON FUNCTION update_post_body_search_index FROM PUBLIC;
|
||||
REVOKE EXECUTE ON FUNCTION get_posts FROM PUBLIC;
|
||||
REVOKE EXECUTE ON FUNCTION get_latest_posts_per_board FROM PUBLIC;
|
||||
|
||||
CREATE ROLE chan_archive_anon nologin;
|
||||
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
|
||||
GRANT SELECT ON sites TO chan_archive_anon;
|
||||
GRANT SELECT ON boards TO chan_archive_anon;
|
||||
GRANT SELECT ON threads TO chan_archive_anon;
|
||||
GRANT SELECT ON posts TO chan_archive_anon;
|
||||
GRANT SELECT ON attachments TO chan_archive_anon;
|
||||
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archive_anon;
|
||||
GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archive_anon;
|
||||
GRANT EXECUTE ON FUNCTION search_posts TO chan_archive_anon;
|
||||
GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon;
|
||||
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
|
||||
GRANT SELECT ON sites TO chan_archive_anon;
|
||||
GRANT SELECT ON boards TO chan_archive_anon;
|
||||
GRANT SELECT ON threads TO chan_archive_anon;
|
||||
GRANT SELECT ON posts TO chan_archive_anon;
|
||||
GRANT SELECT ON attachments TO chan_archive_anon;
|
||||
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archive_anon;
|
||||
GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archive_anon;
|
||||
GRANT EXECUTE ON FUNCTION search_posts TO chan_archive_anon;
|
||||
GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon;
|
||||
GRANT EXECUTE ON FUNCTION get_latest_posts_per_board TO chan_archive_anon;
|
||||
|
||||
-- GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archive_anon;
|
||||
-- GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archive_anon;
|
||||
|
@ -396,6 +471,7 @@ GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archiver;
|
|||
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archiver;
|
||||
GRANT EXECUTE ON FUNCTION search_posts TO chan_archiver;
|
||||
GRANT EXECUTE ON FUNCTION get_posts TO chan_archiver;
|
||||
GRANT EXECUTE ON FUNCTION get_latest_posts_per_board TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
|
||||
|
|
660
src/Backfill.hs
660
src/Backfill.hs
|
@ -1,668 +1,12 @@
|
|||
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
|
||||
{-# HLINT ignore "Redundant bracket" #-}
|
||||
{-# HLINT ignore "Use fromMaybe" #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
module Main where
|
||||
|
||||
import System.Exit
|
||||
import Data.Int (Int64)
|
||||
import Control.Monad (filterM)
|
||||
import Data.Aeson (decode)
|
||||
import qualified Data.ByteString.Lazy as B
|
||||
import Data.Aeson (decode)
|
||||
import System.Console.CmdArgs
|
||||
import System.Directory
|
||||
( listDirectory
|
||||
, doesFileExist
|
||||
, copyFile
|
||||
, createDirectoryIfMissing
|
||||
)
|
||||
import System.FilePath ((</>), (<.>), takeExtension)
|
||||
import Data.List (find, isSuffixOf, foldl', sortBy)
|
||||
import Data.Ord (comparing)
|
||||
import qualified Data.Set as Set
|
||||
import Data.Set (Set)
|
||||
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
|
||||
import Data.Time.Clock (UTCTime)
|
||||
import qualified Data.Map as Map
|
||||
import Data.Maybe (fromJust)
|
||||
import Data.Text (Text, unpack)
|
||||
import qualified Data.Text as T
|
||||
import Data.Text.Encoding (decodeUtf8)
|
||||
import Network.Mime (defaultMimeLookup)
|
||||
import PerceptualHash (fileHash)
|
||||
|
||||
import JSONParsing
|
||||
import Common.Server.JSONSettings
|
||||
import qualified JSONCommonTypes as JS
|
||||
import qualified JSONPost as JSONPosts
|
||||
import qualified Network.DataClient as Client
|
||||
import qualified SitesType as Sites
|
||||
import qualified BoardsType as Boards
|
||||
import qualified ThreadType as Threads
|
||||
import qualified Common.AttachmentType as At
|
||||
import qualified Common.PostsType as Posts
|
||||
import qualified Hash as Hash
|
||||
import qualified Data.WordUtil as Words
|
||||
|
||||
newtype SettingsCLI = SettingsCLI
|
||||
{ jsonFile :: FilePath
|
||||
} deriving (Show, Data, Typeable)
|
||||
|
||||
|
||||
listCatalogDirectories :: JSONSettings -> IO [ FilePath ]
|
||||
listCatalogDirectories settings = do
|
||||
allDirs <- listDirectory (backup_read_root settings)
|
||||
let filteredDirs = filter (`notElem` excludedDirs) allDirs
|
||||
filterM hasCatalog filteredDirs
|
||||
|
||||
where
|
||||
excludedDirs = ["sfw", "alt", "overboard"]
|
||||
|
||||
hasCatalog dir = do
|
||||
let catalogPath = backup_read_root settings </> dir </> "catalog.json"
|
||||
doesFileExist catalogPath
|
||||
|
||||
|
||||
ensureSiteExists :: JSONSettings -> IO Sites.Site
|
||||
ensureSiteExists settings = do
|
||||
sitesResult <- Client.getAllSites settings
|
||||
|
||||
case sitesResult of
|
||||
Right siteList ->
|
||||
case find (\site -> Sites.name site == site_name settings) siteList of
|
||||
Just site -> do
|
||||
putStrLn $ site_name settings ++ " already exists!"
|
||||
return site
|
||||
Nothing -> do
|
||||
putStrLn $ site_name settings ++ " does not exist. Creating..."
|
||||
postResult <- Client.postSite settings
|
||||
|
||||
case postResult of
|
||||
Right (site:_) -> do
|
||||
putStrLn $ "Successfully created " ++ site_name settings ++ ". " ++ show site
|
||||
return site
|
||||
Right [] -> do
|
||||
putStrLn "Did not get new site id back from postgrest"
|
||||
exitFailure
|
||||
Left err -> do
|
||||
putStrLn $ "Failed to create " ++ site_name settings
|
||||
++ " Error: " ++ show err
|
||||
exitFailure
|
||||
|
||||
Left err -> do
|
||||
putStrLn $ "Error fetching sites: " ++ show err
|
||||
exitFailure
|
||||
|
||||
|
||||
createArchivesForNewBoards
|
||||
:: JSONSettings
|
||||
-> Set String
|
||||
-> [ String ]
|
||||
-> Int
|
||||
-> IO [ Boards.Board ]
|
||||
createArchivesForNewBoards settings dirsSet archived_boards siteid = do
|
||||
let archivedBoardsSet = Set.fromList archived_boards
|
||||
|
||||
-- Find boards that are in dirs but not in archived_boards
|
||||
let boardsToArchive = dirsSet `Set.difference` archivedBoardsSet
|
||||
|
||||
putStrLn "Creating boards:"
|
||||
mapM_ putStrLn boardsToArchive
|
||||
|
||||
post_result <- Client.postBoards settings (Set.toList boardsToArchive) siteid
|
||||
|
||||
case post_result of
|
||||
Left err -> do
|
||||
putStrLn $ "Error posting boards: " ++ show err
|
||||
exitFailure
|
||||
Right boards -> do
|
||||
putStrLn "Created the following boards:"
|
||||
mapM_ (putStrLn . Boards.pathpart) boards
|
||||
return boards
|
||||
|
||||
|
||||
apiThreadToArchiveThread :: Int -> Thread -> Threads.Thread
|
||||
apiThreadToArchiveThread board_id_ json_thread =
|
||||
Threads.Thread
|
||||
{ Threads.thread_id = undefined
|
||||
, Threads.board_thread_id = no json_thread
|
||||
, Threads.creation_time = epochToUTCTime $ fromIntegral (time json_thread)
|
||||
, Threads.board_id = board_id_
|
||||
}
|
||||
|
||||
epochToUTCTime :: Int -> UTCTime
|
||||
epochToUTCTime = posixSecondsToUTCTime . realToFrac
|
||||
|
||||
|
||||
createArchivesForNewThreads
|
||||
:: JSONSettings
|
||||
-> [ Thread ]
|
||||
-> [ Threads.Thread ]
|
||||
-> Boards.Board
|
||||
-> IO [ Threads.Thread ]
|
||||
createArchivesForNewThreads settings all_threads archived_threads board = do
|
||||
putStrLn $ "Creating " ++ show (length threads_to_create) ++ " threads."
|
||||
threads_result <- Client.postThreads settings (map (apiThreadToArchiveThread board_id) threads_to_create)
|
||||
|
||||
case threads_result of
|
||||
Left err -> do
|
||||
putStrLn $ "Error creating threads: " ++ show err
|
||||
exitFailure
|
||||
Right new_threads -> return new_threads
|
||||
|
||||
where
|
||||
board_id :: Int = Boards.board_id board
|
||||
|
||||
archived_board_thread_ids :: Set.Set Int
|
||||
archived_board_thread_ids =
|
||||
Set.fromList $ map Threads.board_thread_id archived_threads
|
||||
|
||||
threads_to_create :: [ Thread ]
|
||||
threads_to_create =
|
||||
filter
|
||||
((`Set.notMember` archived_board_thread_ids) . no)
|
||||
all_threads
|
||||
|
||||
|
||||
ensureThreads :: JSONSettings -> Boards.Board -> [ Thread ] -> IO [ Threads.Thread ]
|
||||
ensureThreads settings board all_threads = do
|
||||
threads_result <- Client.getThreads settings (Boards.board_id board) (map no all_threads)
|
||||
|
||||
case threads_result of
|
||||
Left err -> do
|
||||
putStrLn $ "Error fetching threads: " ++ show err
|
||||
exitFailure
|
||||
Right archived_threads -> do
|
||||
putStrLn $ show (length archived_threads) ++ " threads already exist."
|
||||
new_threads <- createArchivesForNewThreads settings all_threads archived_threads board
|
||||
return $ archived_threads ++ new_threads
|
||||
|
||||
|
||||
readPosts
|
||||
:: JSONSettings
|
||||
-> Boards.Board
|
||||
-> Threads.Thread
|
||||
-> IO (Threads.Thread, [ JSONPosts.Post ])
|
||||
readPosts settings board thread = do
|
||||
result <- parsePosts thread_filename
|
||||
|
||||
case result of
|
||||
Left err -> do
|
||||
putStrLn $ "Failed to parse the JSON file " ++ thread_filename ++ " error: " ++ err
|
||||
return (thread, [])
|
||||
Right posts_wrapper -> return (thread, JSONPosts.posts posts_wrapper)
|
||||
|
||||
where
|
||||
thread_filename :: FilePath
|
||||
thread_filename = backupDir </> "res" </> (show (Threads.board_thread_id thread) ++ ".json")
|
||||
|
||||
backupDir :: FilePath
|
||||
backupDir = backup_read_root settings </> Boards.pathpart board
|
||||
|
||||
|
||||
apiPostToPostKey :: Threads.Thread -> JSONPosts.Post -> Client.PostId
|
||||
apiPostToPostKey thread post =
|
||||
Client.PostId
|
||||
{ Client.thread_id = (Threads.thread_id thread)
|
||||
, Client.board_post_id = (JSONPosts.no post)
|
||||
}
|
||||
|
||||
-- Convert Post to DbPost
|
||||
apiPostToArchivePost :: Int -> Threads.Thread -> JSONPosts.Post -> Posts.Post
|
||||
apiPostToArchivePost local_idx thread post =
|
||||
Posts.Post
|
||||
{ Posts.post_id = Nothing
|
||||
, Posts.board_post_id = JSONPosts.no post
|
||||
, Posts.creation_time = posixSecondsToUTCTime (realToFrac $ JSONPosts.time post)
|
||||
, Posts.body = JSONPosts.com post
|
||||
, Posts.name = JSONPosts.name post
|
||||
, Posts.subject = JSONPosts.sub post
|
||||
, Posts.email = JSONPosts.email post
|
||||
, Posts.thread_id = Threads.thread_id thread
|
||||
, Posts.embed = JSONPosts.embed post
|
||||
, Posts.local_idx = local_idx
|
||||
}
|
||||
|
||||
-- | A version of 'concatMap' that works with a monadic predicate.
|
||||
-- Stolen from package extra Control.Monad.Extra
|
||||
concatMapM :: Monad m => (a -> m [b]) -> [a] -> m [b]
|
||||
{-# INLINE concatMapM #-}
|
||||
concatMapM op = foldr f (pure [])
|
||||
where f x xs = do
|
||||
x_ <- op x
|
||||
|
||||
if null x_
|
||||
then xs
|
||||
else do
|
||||
xs_ <- xs
|
||||
pure $ x_ ++ xs_
|
||||
|
||||
|
||||
addPostsToTuples
|
||||
:: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)]
|
||||
-> [ Posts.Post ]
|
||||
-> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)]
|
||||
addPostsToTuples tuples posts = map f posts
|
||||
where
|
||||
post_map :: Map.Map (Int64, Int64) (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)
|
||||
post_map = Map.fromList (map (\(a, b, c, d) -> ((Threads.thread_id c, JSONPosts.no d), (a, b, c, d))) tuples)
|
||||
|
||||
f :: Posts.Post -> (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)
|
||||
f new_post =
|
||||
(\(a, b, c, d) -> (a, b, c, d, new_post))
|
||||
(post_map Map.! (Posts.thread_id new_post, Posts.board_post_id new_post))
|
||||
|
||||
|
||||
fileToAttachment :: Int -> Posts.Post -> JS.File -> At.Attachment
|
||||
fileToAttachment i post file =
|
||||
At.Attachment
|
||||
{ At.mimetype = maybe guessed_mime id (JS.mime file)
|
||||
, At.creation_time = Posts.creation_time post
|
||||
, At.sha256_hash = undefined
|
||||
, At.phash = Nothing
|
||||
, At.illegal = False
|
||||
, At.post_id = fromJust $ Posts.post_id post
|
||||
, At.resolution = dim
|
||||
, At.file_extension = Just extension
|
||||
, At.thumb_extension = Just thumb_extension
|
||||
, At.original_filename = Just $ JS.filename file <> "." <> extension
|
||||
, At.file_size_bytes = JS.fsize file
|
||||
, At.board_filename = JS.id file
|
||||
, At.spoiler = maybe False id $ JS.spoiler file
|
||||
, At.attachment_idx = i
|
||||
}
|
||||
|
||||
where
|
||||
extension = JS.ext file
|
||||
|
||||
thumb_extension = T.pack $ drop 1 $ takeExtension $ unpack $ JS.thumb_path file
|
||||
|
||||
guessed_mime = getMimeType extension
|
||||
|
||||
dim = (JS.w file) >>= \w ->
|
||||
((JS.h file) >>= \h ->
|
||||
Just $ At.Dimension w h)
|
||||
|
||||
|
||||
getMimeType :: Text -> Text
|
||||
getMimeType ext = decodeUtf8 $ defaultMimeLookup ext
|
||||
|
||||
|
||||
phash_mimetypes :: Set.Set Text
|
||||
phash_mimetypes = Set.fromList
|
||||
[ "image/jpeg"
|
||||
, "image/png"
|
||||
, "image/gif"
|
||||
]
|
||||
|
||||
|
||||
copyFiles :: JSONSettings -> (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment) -> IO ()
|
||||
copyFiles settings (site, board, thread, _, path, attachment) = do
|
||||
destination_exists <- doesFileExist dest
|
||||
|
||||
if not destination_exists
|
||||
then do
|
||||
src_exists <- doesFileExist src
|
||||
|
||||
createDirectoryIfMissing True common_dest
|
||||
|
||||
if src_exists
|
||||
then putStrLn ("Copying " ++ src) >> copyFile src dest
|
||||
else return ()
|
||||
|
||||
thumb_exists <- doesFileExist thumb_src
|
||||
|
||||
if thumb_exists
|
||||
then putStrLn ("Copying " ++ thumb_src) >> copyFile thumb_src thumb_dest
|
||||
else return ()
|
||||
|
||||
else return ()
|
||||
|
||||
-- src = (At.file_path | At.thumb_path)
|
||||
-- dest = <media_root>/<website_name>/<boardpart>/<board_thread_id>/<sha>.<ext>
|
||||
|
||||
where
|
||||
src :: FilePath
|
||||
src = At.file_path path
|
||||
|
||||
thumb_src :: FilePath
|
||||
thumb_src = At.thumbnail_path path
|
||||
|
||||
dest :: FilePath
|
||||
dest = common_dest
|
||||
</> (unpack $ At.board_filename attachment)
|
||||
<.> (unpack $ fromJust $ At.file_extension attachment)
|
||||
|
||||
thumb_dest :: FilePath
|
||||
thumb_dest = common_dest
|
||||
</> "thumbnail_" <> (unpack $ At.board_filename attachment)
|
||||
<.> (unpack $ fromJust $ At.thumb_extension attachment)
|
||||
|
||||
common_dest :: FilePath
|
||||
common_dest
|
||||
= (media_root_path settings)
|
||||
</> Sites.name site
|
||||
</> Boards.pathpart board
|
||||
</> (show $ Threads.board_thread_id thread)
|
||||
|
||||
|
||||
processFiles :: JSONSettings -> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)] -> IO ()
|
||||
processFiles settings tuples = do -- perfect just means that our posts have ids, they're already inserted into the db
|
||||
let ps = map (\(_, _, _, _, x) -> x) tuples
|
||||
|
||||
existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id) ps)
|
||||
|
||||
case existing_attachments_result of
|
||||
Left err -> do
|
||||
putStrLn $ "Error fetching attachments: " ++ show err
|
||||
exitFailure
|
||||
Right existing_attachments -> do
|
||||
let map_existing :: Map.Map (Int64, Text) [ At.Attachment ] =
|
||||
foldl'
|
||||
(insertRecord (\a -> (At.post_id a, At.board_filename a)))
|
||||
Map.empty
|
||||
existing_attachments
|
||||
|
||||
let attachments_on_board :: [(Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)] =
|
||||
concatMap parseAttachments tuples
|
||||
-- attachments_on_board are the only files that can be copied into the archive dir right now
|
||||
-- since that's where we have the src filename. except here the Attachment doesn't have a sha hash yet
|
||||
-- so we can't build the destination filename.
|
||||
|
||||
let map_should_exist :: Map.Map (Int64, Text) [(Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)] =
|
||||
foldl'
|
||||
(insertRecord (\(_, _, _, _, _, a) -> (At.post_id a, At.board_filename a)))
|
||||
Map.empty
|
||||
attachments_on_board
|
||||
|
||||
let to_insert_map =
|
||||
Map.filterWithKey
|
||||
(\k _ -> not $ k `Map.member` map_existing)
|
||||
map_should_exist
|
||||
|
||||
let to_insert = foldr (++) [] $ Map.elems to_insert_map
|
||||
|
||||
to_insert_exist <- filterM attachmentFileExists to_insert
|
||||
|
||||
with_hashes <- mapM computeAttachmentHash to_insert_exist
|
||||
|
||||
attachments_result <- Client.postAttachments settings with_hashes
|
||||
|
||||
case attachments_result of
|
||||
Left err -> do
|
||||
putStrLn $ "Error posting attachments: " ++ show err
|
||||
exitFailure
|
||||
|
||||
Right saved -> do
|
||||
putStrLn $ "Saved " ++ (show $ length saved) ++ " attachments!"
|
||||
mapM_ (copyFiles settings) attachments_on_board
|
||||
|
||||
where
|
||||
attachmentFileExists :: (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment) -> IO Bool
|
||||
attachmentFileExists (_, _, _, _, p, _) = doesFileExist (At.file_path p)
|
||||
|
||||
computeAttachmentHash :: (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment) -> IO At.Attachment
|
||||
computeAttachmentHash (_, _, _, _, p, q) = do
|
||||
let f = At.file_path p
|
||||
|
||||
putStrLn $ "Reading " ++ f
|
||||
-- putStrLn $ show p
|
||||
-- putStrLn $ show (q { At.sha256_hash = "undefined" })
|
||||
|
||||
sha256_sum <- Hash.computeSHA256 f
|
||||
|
||||
putStrLn $ "SHA-256: " ++ unpack sha256_sum
|
||||
|
||||
phash :: Maybe Int64 <-
|
||||
case (At.mimetype q) `Set.member` phash_mimetypes of
|
||||
True -> do
|
||||
either_phash <- fileHash f
|
||||
case either_phash of
|
||||
Left err_str -> do
|
||||
putStrLn $ "Failed to compute phash for file " ++ (unpack sha256_sum) ++ " " ++ f ++ " " ++ err_str
|
||||
return Nothing
|
||||
Right phash_w -> do
|
||||
let phash_i = Words.wordToSignedInt64 phash_w
|
||||
|
||||
if phash_i == 0 then do
|
||||
putStrLn $ "phash is 0 for file " ++ (unpack sha256_sum) ++ " " ++ f
|
||||
return Nothing
|
||||
else do
|
||||
putStrLn $ "phash: " ++ show phash_w
|
||||
return $ Just $ Words.wordToSignedInt64 phash_w
|
||||
|
||||
False -> return Nothing
|
||||
|
||||
|
||||
return q
|
||||
{ At.sha256_hash = sha256_sum
|
||||
, At.phash = phash
|
||||
}
|
||||
|
||||
parseLegacyPaths :: JSONPosts.Post -> Maybe (At.Paths, At.Attachment)
|
||||
parseLegacyPaths post = do
|
||||
tim <- JSONPosts.tim post
|
||||
ext <- JSONPosts.ext post
|
||||
filename <- JSONPosts.filename post
|
||||
size <- JSONPosts.fsize post
|
||||
spoiler <- JSONPosts.fsize post
|
||||
|
||||
let
|
||||
board = JSONPosts.board post
|
||||
file_path = withPathPrefix $ board <> "/src/" <> tim <> ext
|
||||
thumbnail_path = withPathPrefix $ board <> "/thumb/" <> tim <> ext
|
||||
|
||||
p = At.Paths file_path thumbnail_path
|
||||
|
||||
mime = getMimeType ext
|
||||
|
||||
attachment = At.Attachment
|
||||
{ At.mimetype = mime
|
||||
, At.creation_time = undefined
|
||||
, At.sha256_hash = undefined
|
||||
, At.phash = Nothing
|
||||
, At.illegal = False
|
||||
, At.post_id = undefined
|
||||
, At.resolution = undefined
|
||||
, At.file_extension = Just $ T.drop 1 ext
|
||||
, At.thumb_extension = Just $ "png"
|
||||
, At.original_filename = Just $ filename <> ext
|
||||
, At.file_size_bytes = size
|
||||
, At.board_filename = tim
|
||||
, At.spoiler = spoiler > 0
|
||||
, At.attachment_idx = 1
|
||||
}
|
||||
|
||||
return (p, attachment)
|
||||
|
||||
|
||||
notDeleted :: (a, b, c, d, At.Paths, At.Attachment) -> Bool
|
||||
notDeleted (_, _, _, _, p, _) = not $ "deleted" `isSuffixOf` (At.file_path p)
|
||||
|
||||
|
||||
withPathPrefix :: Text -> FilePath
|
||||
withPathPrefix = ((<>) $ backup_read_root settings) . unpack
|
||||
|
||||
parseAttachments
|
||||
:: (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)
|
||||
-> [(Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)]
|
||||
parseAttachments (site, board, thread, p, q) = filter notDeleted $
|
||||
case JSONPosts.files p of
|
||||
Just files -> map
|
||||
(\(i, x) ->
|
||||
( site
|
||||
, board
|
||||
, thread
|
||||
, q
|
||||
, At.Paths (withPathPrefix $ JS.file_path x) (withPathPrefix $ JS.thumb_path x)
|
||||
, fileToAttachment i q x
|
||||
)
|
||||
) (zip [1..] files)
|
||||
Nothing ->
|
||||
case parseLegacyPaths p of
|
||||
Nothing -> []
|
||||
Just (paths, a) ->
|
||||
let
|
||||
dim = (JSONPosts.w p) >>= \w -> ((JSONPosts.h p) >>= \h -> Just $ At.Dimension w h)
|
||||
in
|
||||
( site
|
||||
, board
|
||||
, thread
|
||||
, q
|
||||
, paths
|
||||
, a
|
||||
{ At.creation_time = Posts.creation_time q
|
||||
, At.resolution = dim
|
||||
, At.post_id = fromJust $ Posts.post_id q
|
||||
}
|
||||
) : []
|
||||
|
||||
insertRecord
|
||||
:: Ord a
|
||||
=> (b -> a)
|
||||
-> Map.Map a [b]
|
||||
-> b
|
||||
-> Map.Map a [b]
|
||||
insertRecord getKey accMap x =
|
||||
let pid = getKey x
|
||||
l = Map.findWithDefault [] pid accMap
|
||||
in Map.insert pid (x : l) accMap
|
||||
|
||||
|
||||
createNewPosts
|
||||
:: JSONSettings
|
||||
-> [ (Threads.Thread, JSONPosts.Post, Client.PostId) ]
|
||||
-> IO [ Posts.Post ]
|
||||
createNewPosts settings tuples = do
|
||||
existing_post_results <- Client.getPosts settings $ map (\(_, _, c) -> c) tuples
|
||||
existing_posts <- either handleError return existing_post_results
|
||||
|
||||
thread_max_local_idx_result <- Client.getThreadMaxLocalIdx settings thread_ids
|
||||
thread_max_local_idxs <- either handleError return thread_max_local_idx_result
|
||||
|
||||
let existing_set :: Set (Int64, Int64) = Set.fromList (map (\x -> (Posts.thread_id x, Posts.board_post_id x)) existing_posts)
|
||||
|
||||
let to_insert_list :: [ (Threads.Thread, JSONPosts.Post, Client.PostId) ] =
|
||||
sortBy (comparing $ \(_, _, p) -> Client.board_post_id p) $
|
||||
newPosts tuples existing_set
|
||||
|
||||
-- Map of thread_id to the largest local_idx value (which would be the number of the last post in the thread)
|
||||
let local_idx :: Map.Map Int64 Int = Map.fromList thread_max_local_idxs
|
||||
|
||||
let insert_posts :: [ Posts.Post ] = fst $ foldl' foldFn ([], local_idx) to_insert_list
|
||||
|
||||
-- posts to insert are the posts that are not in existing_posts
|
||||
-- so we create a Set (thread_id, board_post_id) ✓
|
||||
-- then check every tuples against the set and the ones not in the set get added to a to_insert_list ✓
|
||||
-- also for every tuples we need to compute a local_idx
|
||||
-- so we create a Map index_map from thread_id to local_idx ✓
|
||||
-- - for existing_posts
|
||||
-- - need to compare posts already in the map with another post and keep the max local_idx ✓
|
||||
-- to get the new local_idx, we must order the to_insert_list by board_post_id, and look up each entry ✓
|
||||
|
||||
print insert_posts
|
||||
posts_result <- Client.postPosts settings insert_posts
|
||||
new_posts <- either handleError return posts_result
|
||||
return $ existing_posts ++ new_posts
|
||||
|
||||
where
|
||||
handleError err = print err >> exitFailure
|
||||
|
||||
thread_ids :: [ Int64 ]
|
||||
thread_ids = Set.elems $ Set.fromList $ map (\(t, _, _) -> Threads.thread_id t) tuples
|
||||
|
||||
newPosts :: [(Threads.Thread, JSONPosts.Post, Client.PostId)] -> Set (Int64, Int64) -> [(Threads.Thread, JSONPosts.Post, Client.PostId)]
|
||||
newPosts ts existing_set = filter (\(_, _, c) -> Set.notMember (Client.thread_id c, Client.board_post_id c) existing_set) ts
|
||||
|
||||
foldFn
|
||||
:: ([Posts.Post], Map.Map Int64 Int)
|
||||
-> (Threads.Thread, JSONPosts.Post, Client.PostId)
|
||||
-> ([Posts.Post], Map.Map Int64 Int)
|
||||
foldFn (posts, idx_map) (t, p, c) =
|
||||
case Map.lookup thread_id idx_map of
|
||||
Nothing -> (post 1 : posts, Map.insert thread_id 1 idx_map)
|
||||
Just i -> (post (i + 1) : posts, Map.insert thread_id (i + 1) idx_map)
|
||||
|
||||
where
|
||||
post :: Int -> Posts.Post
|
||||
post i = apiPostToArchivePost i t p
|
||||
|
||||
thread_id = Client.thread_id c
|
||||
|
||||
processBoard :: JSONSettings -> Sites.Site -> Boards.Board -> IO ()
|
||||
processBoard settings site board = do
|
||||
let catalogPath = backupDir </> "catalog.json"
|
||||
putStrLn $ "catalog file path: " ++ catalogPath
|
||||
|
||||
result <- parseJSONCatalog catalogPath
|
||||
|
||||
case result of
|
||||
Right catalogs -> do
|
||||
let threads_on_board = concatMap ((maybe [] id) . threads) catalogs
|
||||
|
||||
all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board
|
||||
|
||||
all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board
|
||||
|
||||
|
||||
let tuples :: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)] = concatMap
|
||||
(\(t, posts) -> map (\p -> (site, board, t, p)) posts)
|
||||
all_posts_on_board
|
||||
|
||||
posts_result :: [ Posts.Post ] <- createNewPosts settings (map (\(_, _, c, d) -> (c, d, apiPostToPostKey c d)) tuples)
|
||||
|
||||
putStrLn "Sum of post_ids:"
|
||||
print $ sum $ map (fromJust . Posts.post_id) posts_result
|
||||
putStrLn "Sum of board_post_ids:"
|
||||
print $ sum $ map Posts.board_post_id posts_result
|
||||
|
||||
let perfect_post_pairs = addPostsToTuples tuples posts_result
|
||||
|
||||
processFiles settings perfect_post_pairs
|
||||
|
||||
Left errMsg ->
|
||||
putStrLn $ "Failed to parse the JSON file in directory: "
|
||||
++ (Boards.pathpart board) ++ ". Error: " ++ errMsg
|
||||
|
||||
where
|
||||
backupDir :: FilePath
|
||||
backupDir = backup_read_root settings </> (Boards.pathpart board)
|
||||
|
||||
|
||||
processBackupDirectory :: JSONSettings -> IO ()
|
||||
processBackupDirectory settings = do
|
||||
putStrLn "JSON successfully read!"
|
||||
print settings -- print the decoded JSON settings
|
||||
site :: Sites.Site <- ensureSiteExists settings
|
||||
dirs <- listCatalogDirectories settings
|
||||
let dirsSet = Set.fromList dirs
|
||||
let site_id_ = Sites.site_id site
|
||||
boards_result <- Client.getSiteBoards settings site_id_
|
||||
putStrLn "Boards fetched!"
|
||||
|
||||
case boards_result of
|
||||
Left err -> do
|
||||
putStrLn $ "Error fetching boards: " ++ show err
|
||||
exitFailure
|
||||
Right archived_boards -> do
|
||||
let boardnames = map Boards.pathpart archived_boards
|
||||
created_boards <- createArchivesForNewBoards settings dirsSet boardnames site_id_
|
||||
let boards :: [ Boards.Board ] = archived_boards ++ created_boards
|
||||
let boards_we_have_data_for = filter (\board -> Set.member (Boards.pathpart board) dirsSet) boards
|
||||
mapM_ (processBoard settings site) boards_we_have_data_for
|
||||
|
||||
|
||||
-- TODO: detect saged threads by reading the bump time from the thread and comparing
|
||||
-- that time to the timestamp of the most recent post. If the post is newer
|
||||
-- - then the thread is being saged. Reasons it can be saged:
|
||||
-- - it's saged by a mod
|
||||
-- - the post has sage in the email field
|
||||
-- - the thread is full.
|
||||
--
|
||||
-- Better to support all those flags via the api: saged, locked, cyclical?, sticky
|
||||
-- - deleted could be there too
|
||||
-- - edited could be there too
|
||||
import Lib
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
module BoardQueueElem where
|
||||
|
||||
import Data.Time.Clock (UTCTime)
|
||||
|
||||
import SitesType (Site)
|
||||
import BoardsType (Board)
|
||||
|
||||
data BoardQueueElem = BoardQueueElem
|
||||
{ site :: Site
|
||||
, board :: Board
|
||||
, last_modified :: UTCTime
|
||||
} deriving (Show, Eq)
|
||||
|
||||
instance Ord BoardQueueElem where
|
||||
(<=) :: BoardQueueElem -> BoardQueueElem -> Bool
|
||||
a <= b = last_modified a >= last_modified b
|
|
@ -12,5 +12,5 @@ data Board = Board
|
|||
, name :: Maybe String
|
||||
, pathpart :: String
|
||||
, site_id :: Int
|
||||
} deriving (Show, Generic, FromJSON)
|
||||
} deriving (Show, Eq, Generic, FromJSON)
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 62a23581e786f8564653406845c4b2a07d73deb6
|
||||
Subproject commit 88b5f0df7ea5e83a65a6c6153f197da7cd1c6217
|
|
@ -32,7 +32,7 @@ data Thread = Thread
|
|||
, locked :: Maybe Int
|
||||
, cyclical :: Maybe J.Cyclical
|
||||
, last_modified :: Int
|
||||
, board :: Text
|
||||
-- , board :: Text
|
||||
, files :: Maybe [J.File]
|
||||
, resto :: Int
|
||||
, unique_ips :: Maybe Int
|
||||
|
|
|
@ -23,7 +23,7 @@ data Post = Post
|
|||
, cyclical :: Maybe J.Cyclical
|
||||
, last_modified :: Int
|
||||
, embed :: Maybe Text
|
||||
, board :: Text
|
||||
-- , board :: Text
|
||||
, files :: Maybe [J.File]
|
||||
, resto :: Int
|
||||
, unique_ips :: Maybe Int
|
||||
|
|
|
@ -0,0 +1,777 @@
|
|||
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
|
||||
{-# HLINT ignore "Redundant bracket" #-}
|
||||
{-# HLINT ignore "Use fromMaybe" #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE RecordWildCards #-}
|
||||
|
||||
module Lib where
|
||||
|
||||
import System.Exit
|
||||
import Data.Int (Int64)
|
||||
import Control.Monad (filterM)
|
||||
import System.Console.CmdArgs hiding (name)
|
||||
import System.Directory
|
||||
( listDirectory
|
||||
, doesFileExist
|
||||
, copyFile
|
||||
, createDirectoryIfMissing
|
||||
, removeFile
|
||||
)
|
||||
import System.FilePath ((</>), (<.>), takeExtension)
|
||||
import Data.List (find, isSuffixOf, foldl', sortBy)
|
||||
import Data.Ord (comparing)
|
||||
import qualified Data.Set as Set
|
||||
import Data.Set (Set)
|
||||
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
|
||||
import Data.Time.Clock (UTCTime)
|
||||
import qualified Data.Map as Map
|
||||
import Data.Maybe (fromJust, catMaybes)
|
||||
import Data.Text (Text, unpack)
|
||||
import qualified Data.Text as T
|
||||
import Data.Text.Encoding (decodeUtf8)
|
||||
import Network.Mime (defaultMimeLookup)
|
||||
import PerceptualHash (fileHash)
|
||||
import Control.Exception.Safe (tryAny, tryAsync, SomeException, displayException)
|
||||
import qualified Data.ByteString.Lazy as B
|
||||
import Data.Aeson (FromJSON)
|
||||
|
||||
import JSONParsing
|
||||
import qualified JSONCommonTypes as JS
|
||||
import qualified JSONPost as JSONPosts
|
||||
import qualified Network.DataClient as Client
|
||||
import qualified SitesType as Sites
|
||||
import qualified BoardsType as Boards
|
||||
import qualified ThreadType as Threads
|
||||
import qualified Common.AttachmentType as At
|
||||
import qualified Common.PostsType as Posts
|
||||
import qualified Hash
|
||||
import qualified Data.WordUtil as Words
|
||||
import Common.Server.JSONSettings as J
|
||||
import Common.Network.HttpClient (HttpError)
|
||||
import qualified Common.Server.ConsumerSettings as CS
|
||||
|
||||
newtype SettingsCLI = SettingsCLI
|
||||
{ jsonFile :: FilePath
|
||||
} deriving (Show, Data, Typeable)
|
||||
|
||||
|
||||
-- Move a file by reading, writing, and then deleting the original
|
||||
moveFile :: FilePath -> FilePath -> IO ()
|
||||
moveFile src dst =
|
||||
B.readFile src >>= B.writeFile dst >> removeFile src
|
||||
|
||||
|
||||
listCatalogDirectories :: JSONSettings -> IO [ FilePath ]
|
||||
listCatalogDirectories settings = do
|
||||
allDirs <- listDirectory (backup_read_root settings)
|
||||
let filteredDirs = filter (`notElem` excludedDirs) allDirs
|
||||
filterM hasCatalog filteredDirs
|
||||
|
||||
where
|
||||
excludedDirs = ["sfw", "alt", "overboard"]
|
||||
|
||||
hasCatalog dir = do
|
||||
let catalogPath = backup_read_root settings </> dir </> "catalog.json"
|
||||
doesFileExist catalogPath
|
||||
|
||||
|
||||
ensureSiteExists :: JSONSettings -> Either HttpError [ Sites.Site ] -> IO Sites.Site
|
||||
ensureSiteExists settings sitesResult = do
|
||||
case sitesResult of
|
||||
Right siteList ->
|
||||
case find (\site -> Sites.name site == site_name settings) siteList of
|
||||
Just site -> do
|
||||
putStrLn $ site_name settings ++ " already exists!"
|
||||
return site
|
||||
Nothing -> do
|
||||
putStrLn $ site_name settings ++ " does not exist. Creating..."
|
||||
postResult <- Client.postSite settings
|
||||
|
||||
case postResult of
|
||||
Right (site:_) -> do
|
||||
putStrLn $ "Successfully created " ++ site_name settings ++ ". " ++ show site
|
||||
return site
|
||||
Right [] -> do
|
||||
putStrLn "Did not get new site id back from postgrest"
|
||||
exitFailure
|
||||
Left err -> do
|
||||
putStrLn $ "Failed to create " ++ site_name settings
|
||||
++ " Error: " ++ show err
|
||||
exitFailure
|
||||
|
||||
Left err -> do
|
||||
putStrLn $ "Error fetching sites: " ++ show err
|
||||
exitFailure
|
||||
|
||||
|
||||
createArchivesForNewBoards
|
||||
:: JSONSettings
|
||||
-> Set String
|
||||
-> [ String ]
|
||||
-> Int
|
||||
-> IO [ Boards.Board ]
|
||||
createArchivesForNewBoards settings dirsSet archived_boards siteid = do
|
||||
let archivedBoardsSet = Set.fromList archived_boards
|
||||
|
||||
-- Find boards that are in dirs but not in archived_boards
|
||||
let boardsToArchive = dirsSet `Set.difference` archivedBoardsSet
|
||||
|
||||
putStrLn $ "Creating " ++ (show $ length boardsToArchive) ++ " boards:"
|
||||
mapM_ putStrLn boardsToArchive
|
||||
|
||||
if Set.null boardsToArchive
|
||||
then return []
|
||||
else do
|
||||
post_result <- Client.postBoards settings (Set.toList boardsToArchive) siteid
|
||||
|
||||
case post_result of
|
||||
Left err -> do
|
||||
putStrLn $ "Error posting boards: " ++ show err
|
||||
exitFailure
|
||||
Right boards -> do
|
||||
putStrLn "Created the following boards:"
|
||||
mapM_ (putStrLn . Boards.pathpart) boards
|
||||
return boards
|
||||
|
||||
|
||||
apiThreadToArchiveThread :: Int -> Thread -> Threads.Thread
|
||||
apiThreadToArchiveThread board_id_ json_thread =
|
||||
Threads.Thread
|
||||
{ Threads.thread_id = undefined
|
||||
, Threads.board_thread_id = no json_thread
|
||||
, Threads.creation_time = epochToUTCTime $ fromIntegral (time json_thread)
|
||||
, Threads.board_id = board_id_
|
||||
}
|
||||
|
||||
epochToUTCTime :: Int -> UTCTime
|
||||
epochToUTCTime = posixSecondsToUTCTime . realToFrac
|
||||
|
||||
|
||||
createArchivesForNewThreads
|
||||
:: JSONSettings
|
||||
-> [ Thread ]
|
||||
-> [ Threads.Thread ]
|
||||
-> Boards.Board
|
||||
-> IO [ Threads.Thread ]
|
||||
createArchivesForNewThreads settings all_threads archived_threads board = do
|
||||
putStrLn $ "Creating " ++ show (length threads_to_create) ++ " threads."
|
||||
threads_result <- Client.postThreads settings (map (apiThreadToArchiveThread board_id) threads_to_create)
|
||||
|
||||
case threads_result of
|
||||
Left err -> do
|
||||
putStrLn $ "Error creating threads: " ++ show err
|
||||
exitFailure
|
||||
Right new_threads -> return new_threads
|
||||
|
||||
where
|
||||
board_id :: Int = Boards.board_id board
|
||||
|
||||
archived_board_thread_ids :: Set.Set Int
|
||||
archived_board_thread_ids =
|
||||
Set.fromList $ map Threads.board_thread_id archived_threads
|
||||
|
||||
threads_to_create :: [ Thread ]
|
||||
threads_to_create =
|
||||
filter
|
||||
((`Set.notMember` archived_board_thread_ids) . no)
|
||||
all_threads
|
||||
|
||||
|
||||
ensureThreads :: JSONSettings -> Boards.Board -> [ Thread ] -> IO [ Threads.Thread ]
|
||||
ensureThreads settings board all_threads = do
|
||||
threads_result <- Client.getThreads settings (Boards.board_id board) (map no all_threads)
|
||||
|
||||
case threads_result of
|
||||
Left err -> do
|
||||
putStrLn $ "Error fetching threads: " ++ show err
|
||||
exitFailure
|
||||
Right archived_threads -> do
|
||||
putStrLn $ show (length archived_threads) ++ " threads already exist."
|
||||
new_threads <- createArchivesForNewThreads settings all_threads archived_threads board
|
||||
return $ archived_threads ++ new_threads
|
||||
|
||||
|
||||
readPosts
|
||||
:: FileGetters
|
||||
-> Sites.Site
|
||||
-> Boards.Board
|
||||
-> Threads.Thread
|
||||
-> IO (Threads.Thread, [ JSONPosts.Post ])
|
||||
readPosts FileGetters {..} site board thread = do
|
||||
result <- getJSONPosts site relative_path
|
||||
|
||||
case result of
|
||||
Left err -> do
|
||||
putStrLn $ "Failed to parse the JSON file " ++ relative_path ++ " error: " ++ err
|
||||
putStrLn $ "Site: " ++ show site
|
||||
return (thread, [])
|
||||
Right posts_wrapper -> return (thread, JSONPosts.posts posts_wrapper)
|
||||
|
||||
where
|
||||
relative_path :: FilePath
|
||||
relative_path = Boards.pathpart board </> "res" </> (show (Threads.board_thread_id thread) ++ ".json")
|
||||
|
||||
|
||||
apiPostToPostKey :: Threads.Thread -> JSONPosts.Post -> Client.PostId
|
||||
apiPostToPostKey thread post =
|
||||
Client.PostId
|
||||
{ Client.thread_id = (Threads.thread_id thread)
|
||||
, Client.board_post_id = (JSONPosts.no post)
|
||||
}
|
||||
|
||||
|
||||
-- Convert Post to DbPost
|
||||
apiPostToArchivePost :: Int -> Threads.Thread -> JSONPosts.Post -> Posts.Post
|
||||
apiPostToArchivePost local_idx thread post =
|
||||
Posts.Post
|
||||
{ Posts.post_id = Nothing
|
||||
, Posts.board_post_id = JSONPosts.no post
|
||||
, Posts.creation_time = posixSecondsToUTCTime (realToFrac $ JSONPosts.time post)
|
||||
, Posts.body = JSONPosts.com post
|
||||
, Posts.name = JSONPosts.name post
|
||||
, Posts.subject = JSONPosts.sub post
|
||||
, Posts.email = JSONPosts.email post
|
||||
, Posts.thread_id = Threads.thread_id thread
|
||||
, Posts.embed = JSONPosts.embed post
|
||||
, Posts.local_idx = local_idx
|
||||
}
|
||||
|
||||
|
||||
-- | A version of 'concatMap' that works with a monadic predicate.
|
||||
-- Stolen from package extra Control.Monad.Extra
|
||||
concatMapM :: Monad m => (a -> m [b]) -> [a] -> m [b]
|
||||
{-# INLINE concatMapM #-}
|
||||
concatMapM op = foldr f (pure [])
|
||||
where f x xs = do
|
||||
x_ <- op x
|
||||
|
||||
if null x_
|
||||
then xs
|
||||
else do
|
||||
xs_ <- xs
|
||||
pure $ x_ ++ xs_
|
||||
|
||||
|
||||
addPostsToTuples
|
||||
:: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)]
|
||||
-> [ Posts.Post ]
|
||||
-> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)]
|
||||
addPostsToTuples tuples posts = map f posts
|
||||
where
|
||||
post_map :: Map.Map (Int64, Int64) (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)
|
||||
post_map = Map.fromList (map (\(a, b, c, d) -> ((Threads.thread_id c, JSONPosts.no d), (a, b, c, d))) tuples)
|
||||
|
||||
f :: Posts.Post -> (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)
|
||||
f new_post =
|
||||
(\(a, b, c, d) -> (a, b, c, d, new_post))
|
||||
(post_map Map.! (Posts.thread_id new_post, Posts.board_post_id new_post))
|
||||
|
||||
|
||||
fileToAttachment :: Int -> Posts.Post -> JS.File -> At.Attachment
|
||||
fileToAttachment i post file =
|
||||
At.Attachment
|
||||
{ At.mimetype = maybe guessed_mime id (JS.mime file)
|
||||
, At.creation_time = Posts.creation_time post
|
||||
, At.sha256_hash = undefined
|
||||
, At.phash = Nothing
|
||||
, At.illegal = False
|
||||
, At.post_id = fromJust $ Posts.post_id post
|
||||
, At.resolution = dim
|
||||
, At.file_extension = Just extension
|
||||
, At.thumb_extension = Just thumb_extension
|
||||
, At.original_filename = Just $ JS.filename file <> "." <> extension
|
||||
, At.file_size_bytes = JS.fsize file
|
||||
, At.board_filename = JS.id file
|
||||
, At.spoiler = maybe False id $ JS.spoiler file
|
||||
, At.attachment_idx = i
|
||||
}
|
||||
|
||||
where
|
||||
extension = T.filter (/= '.') $ JS.ext file
|
||||
|
||||
thumb_extension = T.pack $ drop 1 $ takeExtension $ unpack $ JS.thumb_path file
|
||||
|
||||
guessed_mime = getMimeType extension
|
||||
|
||||
dim = (JS.w file) >>= \w ->
|
||||
((JS.h file) >>= \h ->
|
||||
Just $ At.Dimension w h)
|
||||
|
||||
|
||||
getMimeType :: Text -> Text
|
||||
getMimeType ext = decodeUtf8 $ defaultMimeLookup ext
|
||||
|
||||
|
||||
phash_mimetypes :: Set.Set Text
|
||||
phash_mimetypes = Set.fromList
|
||||
[ "image/jpeg"
|
||||
, "image/png"
|
||||
, "image/gif"
|
||||
]
|
||||
|
||||
|
||||
copyOrMoveFiles :: JSONSettings -> FileGetters -> Details -> IO ()
|
||||
copyOrMoveFiles settings fgs (site, board, thread, _, path, attachment) = do
|
||||
(copyOrMove fgs) common_dest (src, dest) (thumb_src, thumb_dest)
|
||||
|
||||
-- src = (At.file_path | At.thumb_path)
|
||||
-- dest = <media_root>/<website_name>/<boardpart>/<board_thread_id>/<sha>.<ext>
|
||||
|
||||
where
|
||||
src :: FilePath
|
||||
src = At.file_path path
|
||||
|
||||
thumb_src :: Maybe FilePath
|
||||
thumb_src = At.thumbnail_path path
|
||||
|
||||
dest :: FilePath
|
||||
dest = common_dest
|
||||
</> (unpack $ At.board_filename attachment)
|
||||
<.> (unpack $ fromJust $ At.file_extension attachment)
|
||||
|
||||
thumb_dest :: FilePath
|
||||
thumb_dest = common_dest
|
||||
</> "thumbnail_" <> (unpack $ At.board_filename attachment)
|
||||
<.> (unpack $ fromJust $ At.thumb_extension attachment)
|
||||
|
||||
common_dest :: FilePath
|
||||
common_dest
|
||||
= (media_root_path settings)
|
||||
</> Sites.name site
|
||||
</> Boards.pathpart board
|
||||
</> (show $ Threads.board_thread_id thread)
|
||||
|
||||
|
||||
type Details = (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)
|
||||
|
||||
|
||||
processFiles
|
||||
:: JSONSettings
|
||||
-> FileGetters
|
||||
-> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)]
|
||||
-> IO ()
|
||||
processFiles settings fgs tuples = do -- perfect just means that our posts have ids, they're already inserted into the db
|
||||
let ps = map (\(_, _, _, _, x) -> x) tuples
|
||||
|
||||
existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id) ps)
|
||||
|
||||
case existing_attachments_result of
|
||||
Left err -> do
|
||||
putStrLn $ "Error fetching attachments: " ++ show err
|
||||
exitFailure
|
||||
Right existing_attachments -> do
|
||||
let map_existing :: Map.Map (Int64, Text) [ At.Attachment ] =
|
||||
foldl'
|
||||
(insertRecord (\a -> (At.post_id a, At.board_filename a)))
|
||||
Map.empty
|
||||
existing_attachments
|
||||
|
||||
let attachments_on_board :: [ Details ] =
|
||||
concatMap parseAttachments tuples
|
||||
-- attachments_on_board are the only files that can be copied into the archive dir right now
|
||||
-- since that's where we have the src filename. except here the Attachment doesn't have a sha hash yet
|
||||
-- so we can't build the destination filename.
|
||||
|
||||
let map_should_exist :: Map.Map (Int64, Text) [ Details ] =
|
||||
foldl'
|
||||
(insertRecord (\(_, _, _, _, _, a) -> (At.post_id a, At.board_filename a)))
|
||||
Map.empty
|
||||
attachments_on_board
|
||||
|
||||
let to_insert_map =
|
||||
Map.filterWithKey
|
||||
(\k _ -> not $ k `Map.member` map_existing)
|
||||
map_should_exist
|
||||
|
||||
let to_insert = foldr (++) [] $ Map.elems to_insert_map
|
||||
|
||||
to_insert_ <- mapM ensureAttachmentExists to_insert
|
||||
|
||||
let to_insert_exist = catMaybes to_insert_
|
||||
|
||||
with_hashes <- mapM computeAttachmentHash to_insert_exist
|
||||
|
||||
attachments_result <- Client.postAttachments settings with_hashes
|
||||
|
||||
case attachments_result of
|
||||
Left err -> do
|
||||
putStrLn $ "Error posting attachments: " ++ show err
|
||||
exitFailure
|
||||
|
||||
Right saved -> do
|
||||
putStrLn $ "Saved " ++ (show $ length saved) ++ " attachments!"
|
||||
mapM_ (copyOrMoveFiles settings fgs) to_insert_exist
|
||||
|
||||
where
|
||||
ensureAttachmentExists :: Details -> IO (Maybe Details)
|
||||
ensureAttachmentExists (a, b, c, d, p, f) =
|
||||
(attachmentPaths fgs) p >>=
|
||||
return . (maybe Nothing (\x -> Just (a, b, c, d, x, f)))
|
||||
|
||||
computeAttachmentHash :: Details -> IO At.Attachment
|
||||
computeAttachmentHash (_, _, _, _, p, q) = do
|
||||
let f = At.file_path p
|
||||
|
||||
putStrLn $ "Reading " ++ f
|
||||
|
||||
sha256_sum <- Hash.computeSHA256 f
|
||||
|
||||
putStrLn $ "SHA-256: " ++ unpack sha256_sum
|
||||
|
||||
phash :: Maybe Int64 <-
|
||||
case (At.mimetype q) `Set.member` phash_mimetypes of
|
||||
True -> do
|
||||
putStrLn $ "Running tryAny $ fileHash f " ++ f
|
||||
either_exception <- tryAny $ fileHash f
|
||||
putStrLn $ "Done tryAny $ fileHash f " ++ f
|
||||
|
||||
case either_exception of
|
||||
Left (err :: SomeException) -> do
|
||||
putStrLn $ "Error while computing the perceptual hash of file " ++ f ++ " " ++ displayException err
|
||||
return Nothing
|
||||
Right either_phash ->
|
||||
case either_phash of
|
||||
Left err_str -> do
|
||||
putStrLn $ "Failed to compute phash for file " ++ (unpack sha256_sum) ++ " " ++ f ++ " " ++ err_str
|
||||
return Nothing
|
||||
Right phash_w -> do
|
||||
result <- tryAsync $ do
|
||||
let phash_i = Words.wordToSignedInt64 phash_w
|
||||
|
||||
if phash_i == 0 then do
|
||||
putStrLn $ "phash is 0 for file " ++ (unpack sha256_sum) ++ " " ++ f
|
||||
return Nothing
|
||||
else do
|
||||
putStrLn $ "phash: " ++ show phash_w
|
||||
return $ Just $ Words.wordToSignedInt64 phash_w
|
||||
|
||||
case result of
|
||||
Left (err2 :: SomeException) -> do
|
||||
putStrLn $ "Error handling phash result! " ++ displayException err2
|
||||
return Nothing
|
||||
|
||||
Right w -> return w
|
||||
|
||||
False -> return Nothing
|
||||
|
||||
|
||||
return q
|
||||
{ At.sha256_hash = sha256_sum
|
||||
, At.phash = phash
|
||||
}
|
||||
|
||||
parseLegacyPaths :: Boards.Board -> JSONPosts.Post -> Maybe (At.Paths, At.Attachment)
|
||||
parseLegacyPaths board post = do
|
||||
tim <- JSONPosts.tim post
|
||||
ext <- JSONPosts.ext post
|
||||
filename <- JSONPosts.filename post
|
||||
size <- JSONPosts.fsize post
|
||||
spoiler <- JSONPosts.fsize post
|
||||
|
||||
let
|
||||
board_pathpart = T.pack $ Boards.pathpart board
|
||||
file_path = (withPathPrefix "") </> (T.unpack $ board_pathpart <> "/src/" <> tim <> ext)
|
||||
thumb_extension = "png"
|
||||
thumbnail_path = (withPathPrefix "") </> (T.unpack $ board_pathpart <> "/thumb/" <> tim <> "." <> thumb_extension)
|
||||
|
||||
p = At.Paths file_path (Just thumbnail_path)
|
||||
|
||||
mime = getMimeType ext
|
||||
|
||||
attachment = At.Attachment
|
||||
{ At.mimetype = mime
|
||||
, At.creation_time = undefined
|
||||
, At.sha256_hash = undefined
|
||||
, At.phash = Nothing
|
||||
, At.illegal = False
|
||||
, At.post_id = undefined
|
||||
, At.resolution = undefined
|
||||
, At.file_extension = Just $ T.drop 1 ext
|
||||
, At.thumb_extension = Just $ thumb_extension
|
||||
, At.original_filename = Just $ filename <> ext
|
||||
, At.file_size_bytes = size
|
||||
, At.board_filename = tim
|
||||
, At.spoiler = spoiler > 0
|
||||
, At.attachment_idx = 1
|
||||
}
|
||||
|
||||
return (p, attachment)
|
||||
|
||||
|
||||
notDeleted :: (a, b, c, d, At.Paths, At.Attachment) -> Bool
|
||||
notDeleted (_, _, _, _, p, _) = not $ "deleted" `isSuffixOf` (At.file_path p)
|
||||
|
||||
withPathPrefix :: Text -> FilePath
|
||||
withPathPrefix = (addPathPrefix fgs) . unpack
|
||||
|
||||
parseAttachments
|
||||
:: (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)
|
||||
-> [ Details ]
|
||||
parseAttachments (site, board, thread, p, q) = filter notDeleted $
|
||||
case JSONPosts.files p of
|
||||
Just files -> map
|
||||
(\(i, x) ->
|
||||
( site
|
||||
, board
|
||||
, thread
|
||||
, q
|
||||
, At.Paths (withPathPrefix $ JS.file_path x) (Just $ withPathPrefix $ JS.thumb_path x)
|
||||
, fileToAttachment i q x
|
||||
)
|
||||
) (zip [1..] files)
|
||||
Nothing ->
|
||||
case parseLegacyPaths board p of
|
||||
Nothing -> []
|
||||
Just (paths, a) ->
|
||||
let
|
||||
dim = (JSONPosts.w p) >>= \w -> ((JSONPosts.h p) >>= \h -> Just $ At.Dimension w h)
|
||||
in
|
||||
( site
|
||||
, board
|
||||
, thread
|
||||
, q
|
||||
, paths
|
||||
, a
|
||||
{ At.creation_time = Posts.creation_time q
|
||||
, At.resolution = dim
|
||||
, At.post_id = fromJust $ Posts.post_id q
|
||||
}
|
||||
) : []
|
||||
|
||||
insertRecord
|
||||
:: Ord a
|
||||
=> (b -> a)
|
||||
-> Map.Map a [b]
|
||||
-> b
|
||||
-> Map.Map a [b]
|
||||
insertRecord getKey accMap x =
|
||||
let pid = getKey x
|
||||
l = Map.findWithDefault [] pid accMap
|
||||
in Map.insert pid (x : l) accMap
|
||||
|
||||
|
||||
createNewPosts
|
||||
:: JSONSettings
|
||||
-> [ (Threads.Thread, JSONPosts.Post, Client.PostId) ]
|
||||
-> IO [ Posts.Post ]
|
||||
createNewPosts settings tuples = do
|
||||
existing_post_results <- Client.getPosts settings $ map (\(_, _, c) -> c) tuples
|
||||
existing_posts <- either handleError return existing_post_results
|
||||
|
||||
thread_max_local_idx_result <- Client.getThreadMaxLocalIdx settings thread_ids
|
||||
thread_max_local_idxs <- either handleError return thread_max_local_idx_result
|
||||
|
||||
let existing_set :: Set (Int64, Int64) = Set.fromList (map (\x -> (Posts.thread_id x, Posts.board_post_id x)) existing_posts)
|
||||
|
||||
let to_insert_list :: [ (Threads.Thread, JSONPosts.Post, Client.PostId) ] =
|
||||
sortBy (comparing $ \(_, _, p) -> Client.board_post_id p) $
|
||||
newPosts tuples existing_set
|
||||
|
||||
-- Map of thread_id to the largest local_idx value (which would be the number of the last post in the thread)
|
||||
let local_idx :: Map.Map Int64 Int = Map.fromList thread_max_local_idxs
|
||||
|
||||
let insert_posts :: [ Posts.Post ] = fst $ foldl' foldFn ([], local_idx) to_insert_list
|
||||
|
||||
-- posts to insert are the posts that are not in existing_posts
|
||||
-- so we create a Set (thread_id, board_post_id) ✓
|
||||
-- then check every tuples against the set and the ones not in the set get added to a to_insert_list ✓
|
||||
-- also for every tuples we need to compute a local_idx
|
||||
-- so we create a Map index_map from thread_id to local_idx ✓
|
||||
-- - for existing_posts
|
||||
-- - need to compare posts already in the map with another post and keep the max local_idx ✓
|
||||
-- to get the new local_idx, we must order the to_insert_list by board_post_id, and look up each entry ✓
|
||||
|
||||
print insert_posts
|
||||
posts_result <- Client.postPosts settings insert_posts
|
||||
new_posts <- either handleError return posts_result
|
||||
return $ existing_posts ++ new_posts
|
||||
|
||||
where
|
||||
handleError err = print err >> exitFailure
|
||||
|
||||
thread_ids :: [ Int64 ]
|
||||
thread_ids = Set.elems $ Set.fromList $ map (\(t, _, _) -> Threads.thread_id t) tuples
|
||||
|
||||
newPosts :: [(Threads.Thread, JSONPosts.Post, Client.PostId)] -> Set (Int64, Int64) -> [(Threads.Thread, JSONPosts.Post, Client.PostId)]
|
||||
newPosts ts existing_set = filter (\(_, _, c) -> Set.notMember (Client.thread_id c, Client.board_post_id c) existing_set) ts
|
||||
|
||||
foldFn
|
||||
:: ([Posts.Post], Map.Map Int64 Int)
|
||||
-> (Threads.Thread, JSONPosts.Post, Client.PostId)
|
||||
-> ([Posts.Post], Map.Map Int64 Int)
|
||||
foldFn (posts, idx_map) (t, p, c) =
|
||||
case Map.lookup thread_id idx_map of
|
||||
Nothing -> (post 1 : posts, Map.insert thread_id 1 idx_map)
|
||||
Just i -> (post (i + 1) : posts, Map.insert thread_id (i + 1) idx_map)
|
||||
|
||||
where
|
||||
post :: Int -> Posts.Post
|
||||
post i = apiPostToArchivePost i t p
|
||||
|
||||
thread_id = Client.thread_id c
|
||||
|
||||
|
||||
data FileGetters = FileGetters
|
||||
{ getJSONCatalog :: Sites.Site -> String -> IO (Either String [ Catalog ])
|
||||
, getJSONPosts :: Sites.Site -> String -> IO (Either String JSONPosts.PostWrapper)
|
||||
, addPathPrefix :: String -> String
|
||||
, attachmentPaths :: At.Paths -> IO (Maybe At.Paths)
|
||||
, copyOrMove :: String -> (String, String) -> (Maybe String, String) -> IO ()
|
||||
}
|
||||
|
||||
|
||||
localFileGetters :: JSONSettings -> FileGetters
|
||||
localFileGetters settings = FileGetters
|
||||
{ getJSONCatalog = const $ parseJSONCatalog . withRoot
|
||||
, getJSONPosts = const $ parsePosts . withRoot
|
||||
, addPathPrefix = ((++) $ backup_read_root settings)
|
||||
, attachmentPaths = \p -> do
|
||||
exists <- doesFileExist (At.file_path p)
|
||||
if exists then return (Just p) else return Nothing
|
||||
, copyOrMove = \common_dest (src, dest) (m_thumb_src, thumb_dest) -> do
|
||||
destination_exists <- doesFileExist dest
|
||||
|
||||
if not destination_exists
|
||||
then do
|
||||
src_exists <- doesFileExist src
|
||||
|
||||
createDirectoryIfMissing True common_dest
|
||||
|
||||
if src_exists
|
||||
then putStrLn ("Copying " ++ src) >> copyFile src dest
|
||||
else return ()
|
||||
|
||||
case m_thumb_src of
|
||||
Nothing -> return ()
|
||||
Just thumb_src -> do
|
||||
thumb_exists <- doesFileExist thumb_src
|
||||
|
||||
if thumb_exists
|
||||
then putStrLn ("Copying " ++ thumb_src) >> copyFile thumb_src thumb_dest
|
||||
else return ()
|
||||
|
||||
else return ()
|
||||
}
|
||||
|
||||
where
|
||||
withRoot = (backup_read_root settings </>)
|
||||
|
||||
|
||||
-- | Archive a single board for a site: read its catalog, ensure all of
-- its threads exist in the database, insert any new posts, and then
-- process the attachments belonging to those posts.
processBoard :: JSONSettings -> FileGetters -> Sites.Site -> Boards.Board -> IO ()
processBoard settings fgs@FileGetters {..} site board = do
    let catalogPath = Boards.pathpart board </> "catalog.json"
    putStrLn $ "catalog file path: " ++ catalogPath

    result <- getJSONCatalog site catalogPath

    case result of
        Left errMsg ->
            putStrLn $ "Failed to parse the JSON file in directory: "
                ++ Boards.pathpart board ++ ". Error: " ++ errMsg
        Right (catalogs :: [ Catalog ]) -> do
            -- Catalog pages with no threads contribute nothing.
            let threads_on_board = concatMap ((maybe [] id) . threads) catalogs

            all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board

            all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts fgs site board) all_threads_for_board

            -- Flatten to one tuple per post, carrying its full context.
            let tuples :: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)] =
                    [ (site, board, t, p)
                    | (t, posts) <- all_posts_on_board
                    , p <- posts
                    ]

            posts_result :: [ Posts.Post ] <- createNewPosts settings
                [ (t, p, apiPostToPostKey t p) | (_, _, t, p) <- tuples ]

            -- Diagnostic sums: easy to eyeball whether two runs
            -- inserted the same data.
            putStrLn "Sum of post_ids:"
            print $ sum $ map (fromJust . Posts.post_id) posts_result
            putStrLn "Sum of board_post_ids:"
            print $ sum $ map Posts.board_post_id posts_result

            processFiles settings fgs (addPostsToTuples tuples posts_result)
|
||||
|
||||
|
||||
-- | Resolve the configured site plus the subset of its boards that we
-- have data for, creating archive rows for boards that are new.
-- Exits the process when the board list cannot be fetched.
getBoards :: JSONSettings -> [ FilePath ] -> IO (Sites.Site, [ Boards.Board ])
getBoards settings board_names = do
    site_result <- Client.getAllSites settings
    site :: Sites.Site <- ensureSiteExists settings site_result

    let wanted = Set.fromList board_names
    let sid = Sites.site_id site

    fetched <- Client.getSiteBoards settings sid
    putStrLn "Boards fetched!"

    case fetched of
        Left err -> do
            putStrLn $ "Error fetching boards: " ++ show err
            exitFailure
        Right archived_boards -> do
            let existing_pathparts = map Boards.pathpart archived_boards

            new_boards <- createArchivesForNewBoards settings wanted existing_pathparts sid

            -- Keep only boards named in the settings: those are the
            -- ones backup data exists for.
            let usable =
                    [ b
                    | b <- archived_boards ++ new_boards
                    , Boards.pathpart b `Set.member` wanted
                    ]

            return (site, usable)
|
||||
|
||||
|
||||
-- | Look up the site and its boards, then archive each board in turn.
processBoards :: JSONSettings -> FileGetters -> [ FilePath ] -> IO ()
processBoards settings fgs board_names = do
    (site, boards) <- getBoards settings board_names
    mapM_ (processBoard settings fgs site) boards
|
||||
|
||||
|
||||
-- | Entry point for archiving from a local backup directory: every
-- board directory found under the backup root is processed with the
-- filesystem-backed 'FileGetters'.
processBackupDirectory :: JSONSettings -> IO ()
processBackupDirectory settings = do
    putStrLn "JSON successfully read!"
    print settings -- print the decoded JSON settings
    board_dirs <- listCatalogDirectories settings
    processBoards settings (localFileGetters settings) board_dirs
|
||||
|
||||
|
||||
-- | Build per-site client settings for the HTTP consumer out of the
-- shared consumer settings plus one site's settings.
--
-- 'backup_read_root' only makes sense for local-backup ingestion, so
-- it carries a descriptive 'error' instead of bare 'undefined':
-- forcing it from an HTTP code path now reports which field was
-- wrongly consulted rather than an anonymous Prelude.undefined.
toClientSettings :: CS.ConsumerJSONSettings -> CS.JSONSiteSettings -> J.JSONSettings
toClientSettings CS.ConsumerJSONSettings {..} CS.JSONSiteSettings {..} =
    J.JSONSettings
    { J.postgrest_url = postgrest_url
    , J.jwt = jwt
    , J.backup_read_root =
        error "toClientSettings: backup_read_root is not available when consuming over HTTP"
    , J.media_root_path = media_root_path
    , J.site_name = name
    , J.site_url = root_url
    }
|
||||
|
||||
|
||||
-- | Fetch and decode JSON from a path relative to the site's base URL,
-- rendering any transport error to a String.
httpGetJSON :: (FromJSON a) => Sites.Site -> String -> IO (Either String a)
httpGetJSON site path = do
    result <- Client.getJSON $ Sites.url site </> path
    return $ either (Left . show) Right result
|
||||
|
||||
-- | File access backed by HTTP: JSON documents and attachments are
-- downloaded from the live site. Downloaded attachments land in
-- temporary files, so 'copyOrMove' moves them into place rather than
-- copying.
httpFileGetters :: J.JSONSettings -> FileGetters
httpFileGetters settings = FileGetters
    { getJSONCatalog = httpGetJSON
    , getJSONPosts = httpGetJSON
    , addPathPrefix = (J.site_url settings ++)
    -- Despite the name, this downloads the attachment (and its
    -- thumbnail, when one is referenced) into temporary files and
    -- returns those temporary paths.
    , attachmentPaths = \paths -> do
        m_file <- Client.getFile (At.file_path paths)
        m_thumb <- maybe (return Nothing) Client.getFile (At.thumbnail_path paths)

        -- No main file downloaded => no Paths at all.
        return $ fmap (\fp -> At.Paths fp m_thumb) m_file
    , copyOrMove = \common_dest (src, dest) (m_thumb_src, thumb_dest) -> do
        putStrLn $ "Copy Or Move (Move) src: " ++ src ++ " dest: " ++ dest
        createDirectoryIfMissing True common_dest
        moveFile src dest

        maybe (return ()) (\ts -> moveFile ts thumb_dest) m_thumb_src
    }
|
|
@ -0,0 +1,69 @@
|
|||
{-# LANGUAGE RecordWildCards #-}
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use when" #-}

-- | Consumer entry point: backfills websites over HTTP and/or runs the
-- continuous sync loop, as selected by the settings file.
module Main (main) where

import System.Exit (exitFailure)
import qualified Data.ByteString.Lazy as B
import System.Console.CmdArgs (cmdArgs, Data, Typeable)
import Data.Aeson (eitherDecode)
import Control.Concurrent.Async (mapConcurrently_)

import Common.Server.ConsumerSettings
import Lib
    ( processBoards
    , toClientSettings
    , httpFileGetters
    )
import Sync

-- | Command line arguments: just the JSON settings file path.
newtype CliArgs = CliArgs
  { settingsFile :: String
  } deriving (Show, Data, Typeable)

-- | Load and decode the consumer settings named on the command line,
-- exiting with a diagnostic when the file path is empty or the JSON is
-- malformed.
getSettings :: IO ConsumerJSONSettings
getSettings = do
    cliArgs <- cmdArgs $ CliArgs "consumer_settings.json"

    let filePath = settingsFile cliArgs
    if null filePath
    then do
        putStrLn "Error: No JSON settings file provided."
        exitFailure
    else do
        putStrLn $ "Loading settings from: " ++ filePath
        content <- B.readFile filePath
        -- eitherDecode (rather than decode) surfaces *why* parsing
        -- failed instead of a bare "Invalid JSON format."
        case eitherDecode content :: Either String ConsumerJSONSettings of
            Left err -> do
                putStrLn $ "Error: Invalid JSON format. " ++ err
                exitFailure
            Just_settings -> case Just_settings of
                Right settings -> return settings

-- | Backfill one website over HTTP using its per-site settings.
processWebsite :: ConsumerJSONSettings -> JSONSiteSettings -> IO ()
processWebsite settings site_settings = do
    let client_settings = toClientSettings settings site_settings
    processBoards client_settings (httpFileGetters client_settings) (boards site_settings)
    return ()

main :: IO ()
main = do
    settings <- getSettings
    print settings

    -- Optional one-shot backfill of every configured website, run
    -- concurrently across sites.
    _ <- if http_fill_all settings
    then do
        putStrLn "Starting web backfill"
        mapConcurrently_ (processWebsite settings) (websites settings)
        putStrLn "Finished web backfill"
    else return ()

    -- Optional continuous synchronization loop.
    if http_sync_continously settings
    then do
        putStrLn "Starting web sync loop"
        syncWebsites settings
    else return ()

    putStrLn "Done. Quitting."
|
||||
|
|
@ -1,5 +1,7 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE DeriveAnyClass #-}
|
||||
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
|
||||
{-# HLINT ignore "Use <&>" #-}
|
||||
|
||||
module Network.DataClient
|
||||
( HttpError(..)
|
||||
|
@ -17,6 +19,9 @@ module Network.DataClient
|
|||
, postPosts
|
||||
, getAttachments
|
||||
, postAttachments
|
||||
, getJSON
|
||||
, getFile
|
||||
, getLatestPostsPerBoard
|
||||
) where
|
||||
|
||||
import Control.Monad (forM)
|
||||
|
@ -36,6 +41,8 @@ import Data.Aeson
|
|||
, Value
|
||||
)
|
||||
import GHC.Generics
|
||||
import System.IO.Temp (openBinaryTempFile, getCanonicalTemporaryDirectory)
|
||||
import System.IO (hClose)
|
||||
|
||||
import qualified Common.Server.JSONSettings as T
|
||||
import qualified SitesType as Sites
|
||||
|
@ -45,6 +52,7 @@ import qualified Common.AttachmentType as Attachments
|
|||
import qualified Common.PostsType as Posts
|
||||
import Common.Network.HttpClient
|
||||
import qualified Network.DataClientTypes as T
|
||||
import qualified Network.GetLatestPostsPerBoardResponse as GLPPBR
|
||||
|
||||
|
||||
data PostId = PostId
|
||||
|
@ -214,4 +222,34 @@ eitherDecodeResponse (Left err) = Left err
|
|||
eitherDecodeResponse (Right bs) =
|
||||
case eitherDecode bs of
|
||||
Right val -> Right val
|
||||
Left err -> Left $ StatusCodeError 500 $ LC8.pack $ "Failed to decode JSON: " ++ err ++ " " ++ (show bs)
|
||||
Left err -> Left $ StatusCodeError 500 $ LC8.pack $ "Failed to decode JSON: " ++ err ++ " " ++ show bs
|
||||
|
||||
|
||||
-- | GET a URL and decode the response body as JSON.
getJSON :: (FromJSON a) => String -> IO (Either HttpError a)
getJSON url = fmap eitherDecodeResponse (get_ url [])
|
||||
|
||||
|
||||
-- | Download a URL into a freshly created binary temp file and return
-- its path, or Nothing (after logging the error) when the request
-- fails.
getFile :: String -> IO (Maybe String)
getFile url = do
    putStrLn $ "getFile " ++ url
    get_ url [] >>= either onFailure onSuccess

    where
        -- Log the transport error and give up on this attachment.
        onFailure :: HttpError -> IO (Maybe String)
        onFailure err = do
            putStrLn $ "getFile " ++ url ++ " Error!"
            print err
            return Nothing

        -- Persist the body into a temp file under the system temp dir.
        onSuccess lbs = do
            putStrLn $ "getFile " ++ url ++ " SUCCESS!"
            tmp_root <- getCanonicalTemporaryDirectory
            (tmp_filepath, tmp_filehandle) <- openBinaryTempFile tmp_root "chan.attachment"
            putStrLn $ "Created " ++ tmp_filepath
            putStrLn "Writing attachment..."
            LBS.hPut tmp_filehandle lbs
            hClose tmp_filehandle
            return $ Just tmp_filepath
|
||||
|
||||
|
||||
-- | Call the @get_latest_posts_per_board@ RPC: one row per board
-- describing that board's most recent post.
getLatestPostsPerBoard :: T.JSONSettings -> IO (Either HttpError [ GLPPBR.GetLatestPostsPerBoardResponse ])
getLatestPostsPerBoard settings =
    fmap eitherDecodeResponse $
        post settings "/rpc/get_latest_posts_per_board" mempty False
|
||||
|
|
|
@ -10,4 +10,3 @@ data ThreadMaxIdx = ThreadMaxIdx
|
|||
{ thread_id :: Int64
|
||||
, max_idx :: Int
|
||||
} deriving (Show, Generic, FromJSON)
|
||||
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
{-# LANGUAGE DeriveAnyClass #-}

-- | Row type for the @get_latest_posts_per_board@ PostgREST RPC:
-- the most recent post of each board, used to seed sync ordering.
module Network.GetLatestPostsPerBoardResponse
where

import Data.Int (Int64)
import Data.Time.Clock (UTCTime)
import Data.Aeson (FromJSON)
import GHC.Generics

-- One row per board, describing that board's latest post.
data GetLatestPostsPerBoardResponse = GetLatestPostsPerBoardResponse
    { board_id :: Int
    , site_id :: Int
    , pathpart :: String        -- board path component, e.g. "b"
    , post_id :: Maybe Int64    -- NOTE(review): presumably NULL when the board has no posts yet — confirm against the SQL function
    , board_post_id :: Int64
    , creation_time :: UTCTime  -- creation time of the latest post
    , thread_id :: Int64
    , board_thread_id :: Integer
    } deriving (Show, Generic, FromJSON)
|
|
@ -0,0 +1,101 @@
|
|||
{-
    The purpose of this module is to provide a way to store things
    in order (for example as most recent), take the nth item in
    the order out, and put a new item in. In O(log n) time.

    There is also a function to choose a random integer, but favours lower ones.
    Together, this lets us process data in a safe way asynchronously,
    where if we keep choosing a skewed random item to process all of the items
    eventually get processed. No two threads will process the same thing at the
    same time, and more popular things get processed more often.
-}

module PriorityQueue
( Elem (..)
, Queue
, take
, put
, selectSkewedIndex
, main
)
where

import Prelude hiding (splitAt, take, min, max, elem)
import Data.Set hiding (take, foldr, map)
import Data.Ord (comparing)
import System.Random (StdGen, getStdGen, randomR)
import Data.List (sort, group)


-- | Queue element: ordered — and compared for equality — by
-- 'priority' alone; 'element' carries the payload.
data Elem a = Elem
    { priority :: Int
    , element :: a
    }

-- Ordering looks only at the priority, never at the payload.
instance Ord (Elem a) where
    compare = comparing priority


-- NOTE(review): equality also ignores the payload, so two different
-- payloads with the same priority collide in a Set — only one is kept.
instance Eq (Elem a) where
    (==) x y = priority x == priority y


-- | The queue is just an ordered set of elements.
type Queue a = Set a


-- | Remove and return the n-th largest element (n = 0 is the maximum):
-- 'splitAt' drops the (size - n - 1) smallest, leaving the n+1 largest,
-- whose minimum is the wanted item. Assumes 0 <= n < size; an empty
-- set makes 'findMin' throw.
take :: (Ord a) => Int -> Queue a -> (a, Queue a)
take n set =
    let (_, greater) = splitAt (size set - n - 1) set
        elem = findMin greater
    in (elem, delete elem set)


-- | Insert an element (alias for 'Data.Set.insert').
put :: (Ord a) => a -> Queue a -> Queue a
put = insert


-- Simplified function to generate a number linearly skewed towards the start of the range
-- (squaring the uniform sample biases it toward 0, i.e. toward min;
-- presumably the 0.5 widening offsets the caller's 'ceiling' so index
-- 0 stays reachable — verify against selectSkewedIndex).
linearSkewRandom :: Double -> Double -> StdGen -> (Double, StdGen)
linearSkewRandom min max rng =
    let (u, rng') = randomR (0.0, 1.0) rng
        -- skewedValue = min + (u ** 2) * (max - min)
        skewedValue = (min - 0.5) + (u ** 2) * (max - min + 0.5)
        -- skewedValue = (min - 0.5) + (1 - sqrt u) * (max - min + 0.5)
        -- skewedValue = min + (1 - sqrt u) * (max - min)
    in (skewedValue, rng')


-- Function to select an index from 0 to n-1 with a linear skew towards lower numbers
selectSkewedIndex :: Int -> StdGen -> (Int, StdGen)
selectSkewedIndex n rng =
    let max = fromIntegral (n - 1)
        (randValue, newRng) = linearSkewRandom 0 max rng
    in (ceiling randValue, newRng)


-- | Ad-hoc check: draws 100k skewed indices for a 100-element queue
-- and prints how often each index came up, as "index,count" CSV lines,
-- for eyeballing the distribution.
main :: IO ()
main = do
    stdGen <- getStdGen
    --putStrLn "Hello World"

    -- let i = fst $ selectSkewedIndex (size q) stdGen
    -- let x = fst $ take i q
    -- print (i, priority x)

    let rs = foldr f ([], stdGen) ([1..100000] :: [ Int ])
    mapM_ pf $ countOccurrences $ fst rs

    where
        -- Print one "index,count" CSV line.
        pf :: (Show a, Show b) => (a, b) -> IO ()
        pf (a, b) = putStrLn $ show a ++ "," ++ show b

        -- Thread the RNG through while accumulating drawn indices.
        f _ (xs, gen) =
            let (x, newgen) = selectSkewedIndex (size q) gen
            in (x:xs, newgen)

        q :: Queue (Elem Int)
        q = fromList [ Elem i undefined | i <- [1..100] ]

        -- Count occurrences of each distinct value.
        countOccurrences :: (Eq a, Ord a) => [a] -> [(a, Int)]
        countOccurrences = map (\x -> (head x, length x)) . group . sort
|
||||
|
|
@ -11,5 +11,5 @@ data Site = Site
|
|||
{ site_id :: Int
|
||||
, name :: String
|
||||
, url :: String
|
||||
} deriving (Show, Generic, FromJSON)
|
||||
} deriving (Show, Eq, Generic, FromJSON)
|
||||
|
||||
|
|
|
@ -0,0 +1,157 @@
|
|||
{-# LANGUAGE RecordWildCards #-}
|
||||
|
||||
module Sync where
|
||||
|
||||
import System.Exit (exitFailure)
|
||||
import qualified Data.Map as Map
|
||||
import qualified Data.Set as Set
|
||||
import Data.Maybe (mapMaybe)
|
||||
-- import Control.Monad.Trans.Except (ExceptT, runExceptT)
|
||||
|
||||
import qualified Common.Server.ConsumerSettings as S
|
||||
import qualified Common.Server.JSONSettings as JS
|
||||
import qualified Network.DataClient as Client
|
||||
import qualified Lib
|
||||
import qualified Network.GetLatestPostsPerBoardResponse as GLPPBR
|
||||
import qualified SitesType as Site
|
||||
import qualified BoardsType as Board
|
||||
import qualified BoardQueueElem as QE
|
||||
import qualified PriorityQueue as PQ
|
||||
|
||||
-- | Project consumer-wide settings into the client 'JS.JSONSettings'
-- shape. Only the connection fields are meaningful here; the backup
-- and per-site fields must never be consulted, so they carry
-- descriptive 'error's instead of bare 'undefined' — forcing one now
-- names the offending field rather than crashing anonymously.
consumerSettingsToPartialJSONSettings :: S.ConsumerJSONSettings -> JS.JSONSettings
consumerSettingsToPartialJSONSettings S.ConsumerJSONSettings {..} =
    JS.JSONSettings
    { JS.postgrest_url = postgrest_url
    , JS.jwt = jwt
    , backup_read_root =
        error "consumerSettingsToPartialJSONSettings: backup_read_root is not part of the consumer settings"
    , JS.media_root_path
    , site_name =
        error "consumerSettingsToPartialJSONSettings: site_name is per-site, not global"
    , site_url =
        error "consumerSettingsToPartialJSONSettings: site_url is per-site, not global"
    }
|
||||
|
||||
-- | One-shot setup for continuous synchronization: ensures the
-- configured sites and boards exist server-side, fetches each board's
-- latest post, and builds a priority queue of boards keyed by their
-- last-modified time. The polling loop itself is still unfinished —
-- see the TODO notes at the bottom.
syncWebsites :: S.ConsumerJSONSettings -> IO ()
syncWebsites consumer_settings = do
    putStrLn "Starting channel web synchronization."

    let json_settings = consumerSettingsToPartialJSONSettings consumer_settings

    sitesResult <- Client.getAllSites json_settings

    -- Make sure every website in the settings has a Site row.
    sites <- mapM (flip Lib.ensureSiteExists sitesResult . Lib.toClientSettings consumer_settings) (S.websites consumer_settings)

    print sites

    latest_posts_per_board_results <- Client.getLatestPostsPerBoard json_settings

    latest_posts_per_board <- case latest_posts_per_board_results of
        Left e -> do
            putStrLn $ "Error getting board information: " ++ show e
            exitFailure
        Right latest_posts_per_board -> return latest_posts_per_board

    print latest_posts_per_board

    -- site_id -> pathparts of boards that already exist server-side.
    let boards_per_site :: Map.Map Int [ String ] =
            foldl
                (\m b ->
                    let key = GLPPBR.site_id b
                        pathpart = GLPPBR.pathpart b
                    in

                    Map.insertWith (++) key [ pathpart ] m
                )
                Map.empty
                latest_posts_per_board

    -- board_id -> creation time of that board's newest post.
    let board_id_to_last_modified = Map.fromList $
            map
                (\b -> (GLPPBR.board_id b, GLPPBR.creation_time b))
                latest_posts_per_board

    let site_name_to_site :: Map.Map String Site.Site =
            Map.fromList $ map (\s -> (Site.name s, s)) sites

    -- (site_id, pathpart) -> latest-post row for that board.
    let site_id_board_id_to_glppbr = Map.fromList $
            map
                (\b -> ((GLPPBR.site_id b, GLPPBR.pathpart b), b))
                latest_posts_per_board

    -- For every configured website, pair its Site with all of its
    -- boards: the already-archived ones plus any newly created ones.
    -- NOTE(review): Map.! is partial — a settings name missing from
    -- site_name_to_site, or a site absent from boards_per_site, would
    -- crash here.
    site_and_board_list_ <- mapM
        (\site_settings -> do
            let site = (Map.!) site_name_to_site (S.name site_settings)
            let s_id = Site.site_id site

            -- Latest-post rows for configured boards that already exist.
            let existing_board_info =
                    mapMaybe
                        (\board_pathpart ->
                            Map.lookup (s_id, board_pathpart) site_id_board_id_to_glppbr
                        )
                        (S.boards site_settings)

            let existing_boards =
                    map
                        (\b -> Board.Board
                            { Board.board_id = GLPPBR.board_id b
                            , Board.name = Nothing
                            , Board.pathpart = GLPPBR.pathpart b
                            , Board.site_id = GLPPBR.site_id b
                            }
                        )
                        existing_board_info

            -- Create archive rows for configured boards not yet present.
            boards <- Lib.createArchivesForNewBoards
                (Lib.toClientSettings consumer_settings site_settings)
                (Set.fromList $ S.boards site_settings)
                ((Map.!) boards_per_site s_id)
                s_id

            return (site, existing_boards ++ boards)

        )
        (S.websites consumer_settings)

    let site_and_board_list = concatMap (\(a, bs) -> map (\b -> (a, b)) bs) site_and_board_list_

    -- One queue element per (site, board), prioritized by the board's
    -- last-modified time.
    -- NOTE(review): boards just created above have no entry in
    -- board_id_to_last_modified, so Map.! would crash for them —
    -- verify before enabling continuous sync.
    let queue_elems =
            map
                (\(site, board) -> QE.BoardQueueElem
                    { QE.site = site
                    , QE.board = board
                    , QE.last_modified =
                        (Map.!)
                            board_id_to_last_modified
                            (Board.board_id board)
                    }
                )
                site_and_board_list

    let pq :: PQ.Queue QE.BoardQueueElem = Set.fromList queue_elems

    print pq

    -- we have our boards last modified timestamps
    -- get list of boards per site

    -- first we need all the (Site, Board) tuples ✓
    -- perhaps we even want all (Site, Board, Thread) ✓
    -- But then we don't load the posts of each thread, instead only do
    -- that for threads which change,
    --  - which means after we get all the threads
    --  - enter a loop where you
    --      - pick a board
    --      - compare the threads online to memory
    --      - load only the changed/new ones
    --      - put board back


    -- NEW TODO:
    -- - ensure that sites in the settings exist in the database! ✓
    -- - ensure that boards per site in the settings exist in the database! ✓
    -- - finish using ExceptT and use sites, latest_posts_per_board to populate
    --   our PriorityQueue
    -- - write event loop that
    --      - get pq from stm shared value
    --      - uses the pq (there was something about the timestamps in the pq having to be reversed btw)
    --      - ensures threads
    --      - has a value that should be added to the pq
    --      - uses stm to update pq shared value
|
Loading…
Reference in New Issue