Compare commits

...

19 Commits

Author SHA1 Message Date
towards-a-new-leftypol 3ba873b984 Sync minor refactor - Remove a level of indent 2025-02-05 05:17:50 -05:00
towards-a-new-leftypol bffc851999 Sync - create priority queue 2025-02-05 05:14:24 -05:00
towards-a-new-leftypol 6a557c7c5d Sync - ensure we have all the boards 2025-02-04 23:44:38 -05:00
towards-a-new-leftypol 430a199190 sync ensures sites 2025-02-04 22:22:22 -05:00
towards-a-new-leftypol 1113539321 Sync starts by getting latest post for each board 2025-01-30 18:00:42 -05:00
towards-a-new-leftypol 518467c7eb Add sql function to get the last post on each board 2025-01-29 22:50:28 -05:00
towards-a-new-leftypol 7bf61c0dd2 fetch_catalog should be in sql/initialize.sql
- copy it from the experimental remake_fetch_catalog.sql script
2025-01-29 18:27:57 -05:00
towards-a-new-leftypol 34753c176a Move more things into Lib from Main 2024-04-17 07:44:34 -04:00
towards-a-new-leftypol d3495a9d2d Stub in Sync module to eventually keep the db in sync with sites 2024-04-17 07:11:20 -04:00
towards-a-new-leftypol 04abd71582 Implement a simple priority queue mechanism
- pseudo-random index generator that favours priority indices
2024-04-16 23:49:33 -04:00
towards-a-new-leftypol 8fcea9c84b Explicitly make attachment thumbnail optional 2024-04-12 02:15:42 -04:00
towards-a-new-leftypol cb2da26c64 Fix legacy style attachment file paths 2024-04-11 14:43:15 -04:00
towards-a-new-leftypol 359869984e Handle errors when creating phash 2024-04-11 13:47:44 -04:00
towards-a-new-leftypol 93d789fa65 Can now update archive from url 2024-04-09 21:01:06 -04:00
towards-a-new-leftypol fc321d8531 Abstract out everywhere that references reading src files from local dir
- Now we have two binaries: one to read from local dir, one from http
    - both work on the same json api files
2024-04-09 19:40:29 -04:00
towards-a-new-leftypol 67870cab36 Add a way to save a file by doing a GET request over http to DataClient 2024-04-08 02:14:06 -04:00
towards-a-new-leftypol 1c6c1250e3 Begin generalizing backfill code to use either local or http functions 2024-04-05 19:07:20 -04:00
towards-a-new-leftypol 0086dab7f8 Move everything except for main method from Backfill into Lib (for reuse by consumer Main module) 2024-04-05 01:31:26 -04:00
towards-a-new-leftypol 2588724b8c Build another binary, parse settings 2024-04-03 19:57:13 -04:00
19 changed files with 1455 additions and 678 deletions

View File

@ -2,7 +2,7 @@
"postgrest_url": "http://localhost:3000",
"jwt": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJyb2xlIjoiY2hhbl9hcmNoaXZlciJ9.rGIKZokTDKTuQLIv8138bUby5PELfDipYYIDpJzH02c",
"backup_read_root": "/home/phil/linixy/tmp/leftypol_back/lainchan.leftypol.org",
"media_root_path": "/home/phil/linixy/tmp/chan_archive_media",
"media_root_path": "/home/phil/linixy/tmp/chan_archive_media2/archive",
"site_name": "leftychan",
"site_url": "https://leftychan.net"
}

View File

@ -66,6 +66,7 @@ executable chan-delorean
-- Modules included in this executable, other than Main.
other-modules:
Lib
JSONParsing
SitesType
BoardsType
@ -79,7 +80,9 @@ executable chan-delorean
Data.WordUtil
Network.DataClient
Network.DataClientTypes
Network.GetLatestPostsPerBoardResponse
Common.Server.JSONSettings
Common.Server.ConsumerSettings
-- LANGUAGE extensions used by modules in this package.
-- other-extensions:
@ -101,7 +104,71 @@ executable chan-delorean
cryptonite,
memory,
mime-types,
perceptual-hash
perceptual-hash,
temporary
-- Directories containing source files.
hs-source-dirs: src
-- Base language which the package is written in.
default-language: GHC2021
executable chan-delorean-consoomer
-- Import common warning flags.
import: warnings
-- .hs or .lhs file containing the Main module.
main-is: Main.hs
-- Modules included in this executable, other than Main.
other-modules:
Lib
JSONParsing
SitesType
BoardsType
ThreadType
JSONPost
JSONCommonTypes
Common.PostsType
Common.AttachmentType
Common.Network.HttpClient
Hash
Data.WordUtil
Network.DataClient
Network.DataClientTypes
Network.GetLatestPostsPerBoardResponse
Common.Server.ConsumerSettings
Common.Server.JSONSettings
PriorityQueue
Sync
BoardQueueElem
-- LANGUAGE extensions used by modules in this package.
-- other-extensions:
-- Other library packages from which modules are imported.
build-depends: base,
aeson,
bytestring,
cmdargs,
directory,
filepath,
containers,
text,
http-client,
http-conduit,
safe-exceptions,
http-types,
time,
cryptonite,
memory,
mime-types,
perceptual-hash,
async,
temporary,
stm,
random
-- transformers
-- Directories containing source files.
hs-source-dirs: src

17
consumer_settings.json Normal file
View File

@ -0,0 +1,17 @@
{
"websites": [
{
"name": "example",
"root_url": "https://example.net",
"boards": [
"tech",
"meta"
]
}
],
"postgrest_url": "http://localhost:3000",
"jwt": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJyb2xlIjoiY2hhbl9hcmNoaXZlciJ9.rGIKZokTDKTuQLIv8138bUby5PELfDipYYIDpJzH02c",
"media_root_path": "/home/phil/linixy/tmp/chan_archive_media_repaired/archive",
"http_fill_all": false,
"http_sync_continously": true
}

View File

@ -222,3 +222,99 @@ SELECT * FROM boards;
SELECT * FROM threads WHERE thread_id = 11314;
ANALYZE posts;
SELECT count(*) from attachments;
SELECT * FROM attachments WHERE post_id = 253383;
SELECT * from attachments WHERE board_filename = '1722466065515';
-- How many attachments precede the one with this board filename?
-- (Original was missing the FROM keyword, so "attachments" parsed as a
-- column alias and the query had no table to read from.)
SELECT count(*) FROM attachments WHERE attachment_id < (SELECT attachment_id FROM attachments WHERE board_filename = '1722466065515');
SELECT max(attachment_id) FROM attachments a;
SELECT pg_get_serial_sequence('attachments', 'attachment_id');
SELECT setval(pg_get_serial_sequence('attachments', 'attachment_id'), COALESCE(198853, 1), true);
UPDATE attachments SET thumb_extension = 'png'
WHERE
attachment_id IN
(
SELECT a.attachment_id
FROM attachments a
JOIN posts p ON a.post_id = p.post_id
JOIN threads t ON p.thread_id = t.thread_id
JOIN boards b ON t.board_id = b.board_id
JOIN sites s ON b.site_id = s.site_id
WHERE s.name = 'leftychan'
AND a.thumb_extension = 'jpg'
);
SELECT * FROM posts WHERE board_post_id = 129;
SELECT * FROM attachments WHERE post_id = 461287;
SELECT count(a.*)
FROM attachments a
JOIN posts p ON a.post_id = p.post_id
JOIN threads t ON p.thread_id = t.thread_id
JOIN boards b ON t.board_id = b.board_id
JOIN sites s ON b.site_id = s.site_id
WHERE s.name = 'leftychan'
AND a.thumb_extension = 'jpg';
SELECT * FROM posts
JOIN threads ON threads.thread_id = posts.thread_id
JOIN boards ON boards.board_id = threads.board_id
WHERE boards.pathpart = 'leftypol'
AND boards.site_id = 1
ORDER BY posts.creation_time DESC
LIMIT 1;
SELECT * FROM posts
ORDER BY posts.creation_time DESC
LIMIT 1;
SELECT boards.board_id, boards.pathpart, sites.name FROM boards JOIN sites ON sites.site_id = boards.site_id;
SELECT DISTINCT ON (b.board_id)
b.board_id,
b.site_id,
b.pathpart,
p.post_id,
p.board_post_id,
p.creation_time,
p.body,
t.thread_id,
t.board_thread_id
FROM boards b
JOIN threads t ON t.board_id = b.board_id
JOIN posts p ON p.thread_id = t.thread_id
ORDER BY b.board_id, p.creation_time DESC;
CREATE OR REPLACE FUNCTION get_latest_posts_per_board()
RETURNS TABLE (
board_id int,
site_id int,
pathpart text,
post_id bigint,
board_post_id bigint,
creation_time timestamp with time zone,
thread_id bigint,
board_thread_id bigint
) AS $$
SELECT DISTINCT ON (b.board_id)
b.board_id,
b.site_id,
b.pathpart,
p.post_id,
p.board_post_id,
p.creation_time,
t.thread_id,
t.board_thread_id
FROM boards b
JOIN threads t ON t.board_id = b.board_id
JOIN posts p ON p.thread_id = t.thread_id
ORDER BY b.board_id, p.creation_time DESC;
$$ LANGUAGE sql STABLE;
SELECT * FROM get_latest_posts_per_board();

View File

@ -21,6 +21,7 @@ DROP TYPE IF EXISTS post_key CASCADE;
DROP FUNCTION IF EXISTS update_post_body_search_index;
DROP FUNCTION IF EXISTS fetch_top_threads;
DROP FUNCTION IF EXISTS fetch_catalog;
DROP FUNCTION IF EXISTS get_latest_posts_per_board;
-- It won't let us drop roles otherwise and the IFs are to keep this script idempotent.
@ -222,7 +223,7 @@ CREATE OR REPLACE FUNCTION fetch_top_threads(
lookback INT DEFAULT 10000
)
RETURNS TABLE(bump_time TIMESTAMPTZ, post_count BIGINT, thread_id BIGINT, where_to_leave_off TIMESTAMPTZ)
LANGUAGE sql
LANGUAGE sql STABLE
AS $$
SELECT
max(creation_time) as bump_time,
@ -266,6 +267,51 @@ CREATE TYPE catalog_grid_result AS
);
CREATE OR REPLACE FUNCTION fetch_catalog(max_time timestamptz, max_row_read int DEFAULT 10000)
RETURNS SETOF catalog_grid_result AS $$
WITH
top AS
(
SELECT * FROM fetch_top_threads(max_time, max_row_read) AS top
),
tall_posts AS
(
SELECT
top.post_count AS estimated_post_count,
posts.post_id,
posts.board_post_id,
posts.creation_time,
top.bump_time,
posts.body,
posts.subject,
posts.thread_id,
posts.embed
FROM top
JOIN posts ON top.thread_id = posts.thread_id AND posts.local_idx = 1
WHERE creation_time < max_time
)
SELECT
-- post_counts.post_count,
tall_posts.*,
threads.board_thread_id, -- this should be part of the url path when creating links, not thread_id (that's internal)
boards.pathpart,
sites."name",
-- sites.site_id,
attachments.mimetype AS file_mimetype,
attachments.illegal AS file_illegal,
-- attachments.resolution AS file_resolution,
attachments.board_filename AS file_name,
attachments.file_extension,
attachments.thumb_extension AS file_thumb_extension
FROM tall_posts
JOIN threads ON tall_posts.thread_id = threads.thread_id
JOIN boards ON threads.board_id = boards.board_id
JOIN sites ON sites.site_id = boards.site_id
LEFT OUTER JOIN attachments ON attachments.post_id = tall_posts.post_id AND attachments.attachment_idx = 1
ORDER BY bump_time DESC;
$$ LANGUAGE sql STABLE;
-- Function: search_posts
--
-- This function performs a full-text search on the `posts` table using PostgreSQL's text search features.
@ -355,6 +401,33 @@ RETURNS SETOF catalog_grid_result AS $$
$$ LANGUAGE sql STABLE;
CREATE OR REPLACE FUNCTION get_latest_posts_per_board()
RETURNS TABLE (
board_id int,
site_id int,
pathpart text,
post_id bigint,
board_post_id bigint,
creation_time timestamp with time zone,
thread_id bigint,
board_thread_id bigint
) AS $$
SELECT DISTINCT ON (b.board_id)
b.board_id,
b.site_id,
b.pathpart,
p.post_id,
p.board_post_id,
p.creation_time,
t.thread_id,
t.board_thread_id
FROM boards b
JOIN threads t ON t.board_id = b.board_id
JOIN posts p ON p.thread_id = t.thread_id
ORDER BY b.board_id, p.creation_time DESC;
$$ LANGUAGE sql STABLE;
/*
* Permissions
*/
@ -364,18 +437,20 @@ REVOKE EXECUTE ON FUNCTION fetch_catalog FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION search_posts FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION update_post_body_search_index FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION get_posts FROM PUBLIC;
REVOKE EXECUTE ON FUNCTION get_latest_posts_per_board FROM PUBLIC;
CREATE ROLE chan_archive_anon nologin;
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
GRANT SELECT ON sites TO chan_archive_anon;
GRANT SELECT ON boards TO chan_archive_anon;
GRANT SELECT ON threads TO chan_archive_anon;
GRANT SELECT ON posts TO chan_archive_anon;
GRANT SELECT ON attachments TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION search_posts TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon;
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
GRANT SELECT ON sites TO chan_archive_anon;
GRANT SELECT ON boards TO chan_archive_anon;
GRANT SELECT ON threads TO chan_archive_anon;
GRANT SELECT ON posts TO chan_archive_anon;
GRANT SELECT ON attachments TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION search_posts TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION get_posts TO chan_archive_anon;
GRANT EXECUTE ON FUNCTION get_latest_posts_per_board TO chan_archive_anon;
-- GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archive_anon;
-- GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archive_anon;
@ -396,6 +471,7 @@ GRANT EXECUTE ON FUNCTION fetch_top_threads TO chan_archiver;
GRANT EXECUTE ON FUNCTION fetch_catalog TO chan_archiver;
GRANT EXECUTE ON FUNCTION search_posts TO chan_archiver;
GRANT EXECUTE ON FUNCTION get_posts TO chan_archiver;
GRANT EXECUTE ON FUNCTION get_latest_posts_per_board TO chan_archiver;
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;

View File

@ -1,668 +1,12 @@
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Redundant bracket" #-}
{-# HLINT ignore "Use fromMaybe" #-}
{-# LANGUAGE OverloadedStrings #-}
module Main where
import System.Exit
import Data.Int (Int64)
import Control.Monad (filterM)
import Data.Aeson (decode)
import qualified Data.ByteString.Lazy as B
import Data.Aeson (decode)
import System.Console.CmdArgs
import System.Directory
( listDirectory
, doesFileExist
, copyFile
, createDirectoryIfMissing
)
import System.FilePath ((</>), (<.>), takeExtension)
import Data.List (find, isSuffixOf, foldl', sortBy)
import Data.Ord (comparing)
import qualified Data.Set as Set
import Data.Set (Set)
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Data.Time.Clock (UTCTime)
import qualified Data.Map as Map
import Data.Maybe (fromJust)
import Data.Text (Text, unpack)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8)
import Network.Mime (defaultMimeLookup)
import PerceptualHash (fileHash)
import JSONParsing
import Common.Server.JSONSettings
import qualified JSONCommonTypes as JS
import qualified JSONPost as JSONPosts
import qualified Network.DataClient as Client
import qualified SitesType as Sites
import qualified BoardsType as Boards
import qualified ThreadType as Threads
import qualified Common.AttachmentType as At
import qualified Common.PostsType as Posts
import qualified Hash as Hash
import qualified Data.WordUtil as Words
newtype SettingsCLI = SettingsCLI
{ jsonFile :: FilePath
} deriving (Show, Data, Typeable)
-- | Enumerate board directories under the backup root that actually
-- contain a @catalog.json@, skipping the aggregate/overboard-style dirs.
listCatalogDirectories :: JSONSettings -> IO [ FilePath ]
listCatalogDirectories settings = do
    entries <- listDirectory root
    filterM containsCatalog (filter wanted entries)
    where
        root = backup_read_root settings

        -- directories that are views over other boards, not boards themselves
        wanted :: FilePath -> Bool
        wanted = (`notElem` ["sfw", "alt", "overboard"])

        containsCatalog :: FilePath -> IO Bool
        containsCatalog dir = doesFileExist (root </> dir </> "catalog.json")
ensureSiteExists :: JSONSettings -> IO Sites.Site
ensureSiteExists settings = do
sitesResult <- Client.getAllSites settings
case sitesResult of
Right siteList ->
case find (\site -> Sites.name site == site_name settings) siteList of
Just site -> do
putStrLn $ site_name settings ++ " already exists!"
return site
Nothing -> do
putStrLn $ site_name settings ++ " does not exist. Creating..."
postResult <- Client.postSite settings
case postResult of
Right (site:_) -> do
putStrLn $ "Successfully created " ++ site_name settings ++ ". " ++ show site
return site
Right [] -> do
putStrLn "Did not get new site id back from postgrest"
exitFailure
Left err -> do
putStrLn $ "Failed to create " ++ site_name settings
++ " Error: " ++ show err
exitFailure
Left err -> do
putStrLn $ "Error fetching sites: " ++ show err
exitFailure
createArchivesForNewBoards
:: JSONSettings
-> Set String
-> [ String ]
-> Int
-> IO [ Boards.Board ]
createArchivesForNewBoards settings dirsSet archived_boards siteid = do
let archivedBoardsSet = Set.fromList archived_boards
-- Find boards that are in dirs but not in archived_boards
let boardsToArchive = dirsSet `Set.difference` archivedBoardsSet
putStrLn "Creating boards:"
mapM_ putStrLn boardsToArchive
post_result <- Client.postBoards settings (Set.toList boardsToArchive) siteid
case post_result of
Left err -> do
putStrLn $ "Error posting boards: " ++ show err
exitFailure
Right boards -> do
putStrLn "Created the following boards:"
mapM_ (putStrLn . Boards.pathpart) boards
return boards
apiThreadToArchiveThread :: Int -> Thread -> Threads.Thread
apiThreadToArchiveThread board_id_ json_thread =
Threads.Thread
{ Threads.thread_id = undefined
, Threads.board_thread_id = no json_thread
, Threads.creation_time = epochToUTCTime $ fromIntegral (time json_thread)
, Threads.board_id = board_id_
}
-- | Convert a Unix epoch timestamp (whole seconds) to 'UTCTime'.
epochToUTCTime :: Int -> UTCTime
epochToUTCTime seconds = posixSecondsToUTCTime (realToFrac seconds)
createArchivesForNewThreads
:: JSONSettings
-> [ Thread ]
-> [ Threads.Thread ]
-> Boards.Board
-> IO [ Threads.Thread ]
createArchivesForNewThreads settings all_threads archived_threads board = do
putStrLn $ "Creating " ++ show (length threads_to_create) ++ " threads."
threads_result <- Client.postThreads settings (map (apiThreadToArchiveThread board_id) threads_to_create)
case threads_result of
Left err -> do
putStrLn $ "Error creating threads: " ++ show err
exitFailure
Right new_threads -> return new_threads
where
board_id :: Int = Boards.board_id board
archived_board_thread_ids :: Set.Set Int
archived_board_thread_ids =
Set.fromList $ map Threads.board_thread_id archived_threads
threads_to_create :: [ Thread ]
threads_to_create =
filter
((`Set.notMember` archived_board_thread_ids) . no)
all_threads
ensureThreads :: JSONSettings -> Boards.Board -> [ Thread ] -> IO [ Threads.Thread ]
ensureThreads settings board all_threads = do
threads_result <- Client.getThreads settings (Boards.board_id board) (map no all_threads)
case threads_result of
Left err -> do
putStrLn $ "Error fetching threads: " ++ show err
exitFailure
Right archived_threads -> do
putStrLn $ show (length archived_threads) ++ " threads already exist."
new_threads <- createArchivesForNewThreads settings all_threads archived_threads board
return $ archived_threads ++ new_threads
readPosts
:: JSONSettings
-> Boards.Board
-> Threads.Thread
-> IO (Threads.Thread, [ JSONPosts.Post ])
readPosts settings board thread = do
result <- parsePosts thread_filename
case result of
Left err -> do
putStrLn $ "Failed to parse the JSON file " ++ thread_filename ++ " error: " ++ err
return (thread, [])
Right posts_wrapper -> return (thread, JSONPosts.posts posts_wrapper)
where
thread_filename :: FilePath
thread_filename = backupDir </> "res" </> (show (Threads.board_thread_id thread) ++ ".json")
backupDir :: FilePath
backupDir = backup_read_root settings </> Boards.pathpart board
apiPostToPostKey :: Threads.Thread -> JSONPosts.Post -> Client.PostId
apiPostToPostKey thread post =
Client.PostId
{ Client.thread_id = (Threads.thread_id thread)
, Client.board_post_id = (JSONPosts.no post)
}
-- Convert Post to DbPost
apiPostToArchivePost :: Int -> Threads.Thread -> JSONPosts.Post -> Posts.Post
apiPostToArchivePost local_idx thread post =
Posts.Post
{ Posts.post_id = Nothing
, Posts.board_post_id = JSONPosts.no post
, Posts.creation_time = posixSecondsToUTCTime (realToFrac $ JSONPosts.time post)
, Posts.body = JSONPosts.com post
, Posts.name = JSONPosts.name post
, Posts.subject = JSONPosts.sub post
, Posts.email = JSONPosts.email post
, Posts.thread_id = Threads.thread_id thread
, Posts.embed = JSONPosts.embed post
, Posts.local_idx = local_idx
}
-- | A version of 'concatMap' that works with a monadic predicate.
-- Stolen from package extra Control.Monad.Extra
-- | A version of 'concatMap' that works with a monadic function.
-- Stolen from package extra Control.Monad.Extra
concatMapM :: Monad m => (a -> m [b]) -> [a] -> m [b]
{-# INLINE concatMapM #-}
concatMapM op = foldr step (pure [])
    where
        -- skip appending entirely when a step yields no results
        step x acc = do
            ys <- op x
            if null ys
                then acc
                else fmap (ys ++) acc
addPostsToTuples
:: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)]
-> [ Posts.Post ]
-> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)]
addPostsToTuples tuples posts = map f posts
where
post_map :: Map.Map (Int64, Int64) (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)
post_map = Map.fromList (map (\(a, b, c, d) -> ((Threads.thread_id c, JSONPosts.no d), (a, b, c, d))) tuples)
f :: Posts.Post -> (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)
f new_post =
(\(a, b, c, d) -> (a, b, c, d, new_post))
(post_map Map.! (Posts.thread_id new_post, Posts.board_post_id new_post))
fileToAttachment :: Int -> Posts.Post -> JS.File -> At.Attachment
fileToAttachment i post file =
At.Attachment
{ At.mimetype = maybe guessed_mime id (JS.mime file)
, At.creation_time = Posts.creation_time post
, At.sha256_hash = undefined
, At.phash = Nothing
, At.illegal = False
, At.post_id = fromJust $ Posts.post_id post
, At.resolution = dim
, At.file_extension = Just extension
, At.thumb_extension = Just thumb_extension
, At.original_filename = Just $ JS.filename file <> "." <> extension
, At.file_size_bytes = JS.fsize file
, At.board_filename = JS.id file
, At.spoiler = maybe False id $ JS.spoiler file
, At.attachment_idx = i
}
where
extension = JS.ext file
thumb_extension = T.pack $ drop 1 $ takeExtension $ unpack $ JS.thumb_path file
guessed_mime = getMimeType extension
dim = (JS.w file) >>= \w ->
((JS.h file) >>= \h ->
Just $ At.Dimension w h)
-- | Resolve a file extension to a MIME type via the default lookup table.
getMimeType :: Text -> Text
getMimeType = decodeUtf8 . defaultMimeLookup
phash_mimetypes :: Set.Set Text
phash_mimetypes = Set.fromList
[ "image/jpeg"
, "image/png"
, "image/gif"
]
copyFiles :: JSONSettings -> (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment) -> IO ()
copyFiles settings (site, board, thread, _, path, attachment) = do
destination_exists <- doesFileExist dest
if not destination_exists
then do
src_exists <- doesFileExist src
createDirectoryIfMissing True common_dest
if src_exists
then putStrLn ("Copying " ++ src) >> copyFile src dest
else return ()
thumb_exists <- doesFileExist thumb_src
if thumb_exists
then putStrLn ("Copying " ++ thumb_src) >> copyFile thumb_src thumb_dest
else return ()
else return ()
-- src = (At.file_path | At.thumb_path)
-- dest = <media_root>/<website_name>/<boardpart>/<board_thread_id>/<sha>.<ext>
where
src :: FilePath
src = At.file_path path
thumb_src :: FilePath
thumb_src = At.thumbnail_path path
dest :: FilePath
dest = common_dest
</> (unpack $ At.board_filename attachment)
<.> (unpack $ fromJust $ At.file_extension attachment)
thumb_dest :: FilePath
thumb_dest = common_dest
</> "thumbnail_" <> (unpack $ At.board_filename attachment)
<.> (unpack $ fromJust $ At.thumb_extension attachment)
common_dest :: FilePath
common_dest
= (media_root_path settings)
</> Sites.name site
</> Boards.pathpart board
</> (show $ Threads.board_thread_id thread)
processFiles :: JSONSettings -> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)] -> IO ()
processFiles settings tuples = do -- perfect just means that our posts have ids, they're already inserted into the db
let ps = map (\(_, _, _, _, x) -> x) tuples
existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id) ps)
case existing_attachments_result of
Left err -> do
putStrLn $ "Error fetching attachments: " ++ show err
exitFailure
Right existing_attachments -> do
let map_existing :: Map.Map (Int64, Text) [ At.Attachment ] =
foldl'
(insertRecord (\a -> (At.post_id a, At.board_filename a)))
Map.empty
existing_attachments
let attachments_on_board :: [(Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)] =
concatMap parseAttachments tuples
-- attachments_on_board are the only files that can be copied into the archive dir right now
-- since that's where we have the src filename. except here the Attachment doesn't have a sha hash yet
-- so we can't build the destination filename.
let map_should_exist :: Map.Map (Int64, Text) [(Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)] =
foldl'
(insertRecord (\(_, _, _, _, _, a) -> (At.post_id a, At.board_filename a)))
Map.empty
attachments_on_board
let to_insert_map =
Map.filterWithKey
(\k _ -> not $ k `Map.member` map_existing)
map_should_exist
let to_insert = foldr (++) [] $ Map.elems to_insert_map
to_insert_exist <- filterM attachmentFileExists to_insert
with_hashes <- mapM computeAttachmentHash to_insert_exist
attachments_result <- Client.postAttachments settings with_hashes
case attachments_result of
Left err -> do
putStrLn $ "Error posting attachments: " ++ show err
exitFailure
Right saved -> do
putStrLn $ "Saved " ++ (show $ length saved) ++ " attachments!"
mapM_ (copyFiles settings) attachments_on_board
where
attachmentFileExists :: (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment) -> IO Bool
attachmentFileExists (_, _, _, _, p, _) = doesFileExist (At.file_path p)
computeAttachmentHash :: (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment) -> IO At.Attachment
computeAttachmentHash (_, _, _, _, p, q) = do
let f = At.file_path p
putStrLn $ "Reading " ++ f
-- putStrLn $ show p
-- putStrLn $ show (q { At.sha256_hash = "undefined" })
sha256_sum <- Hash.computeSHA256 f
putStrLn $ "SHA-256: " ++ unpack sha256_sum
phash :: Maybe Int64 <-
case (At.mimetype q) `Set.member` phash_mimetypes of
True -> do
either_phash <- fileHash f
case either_phash of
Left err_str -> do
putStrLn $ "Failed to compute phash for file " ++ (unpack sha256_sum) ++ " " ++ f ++ " " ++ err_str
return Nothing
Right phash_w -> do
let phash_i = Words.wordToSignedInt64 phash_w
if phash_i == 0 then do
putStrLn $ "phash is 0 for file " ++ (unpack sha256_sum) ++ " " ++ f
return Nothing
else do
putStrLn $ "phash: " ++ show phash_w
return $ Just $ Words.wordToSignedInt64 phash_w
False -> return Nothing
return q
{ At.sha256_hash = sha256_sum
, At.phash = phash
}
-- | Build attachment paths and metadata for posts in the legacy JSON
-- format, where a single attachment is described by tim/ext/filename
-- fields directly on the post. Returns Nothing when any of those fields
-- is absent (i.e. the post carries no legacy attachment).
parseLegacyPaths :: JSONPosts.Post -> Maybe (At.Paths, At.Attachment)
parseLegacyPaths post = do
    tim <- JSONPosts.tim post
    ext <- JSONPosts.ext post
    filename <- JSONPosts.filename post
    size <- JSONPosts.fsize post
    -- NOTE(review): 'spoiler' is read from fsize, same as 'size' above —
    -- this looks like a copy/paste slip (as written, spoiler is True for
    -- any non-empty file). Confirm whether the post JSON has a dedicated
    -- spoiler field that should be read here instead.
    spoiler <- JSONPosts.fsize post
    let
        board = JSONPosts.board post
        file_path = withPathPrefix $ board <> "/src/" <> tim <> ext
        -- NOTE(review): thumbnail path reuses the full-size extension while
        -- thumb_extension below is hard-coded to "png" — confirm which one
        -- the legacy site actually serves.
        thumbnail_path = withPathPrefix $ board <> "/thumb/" <> tim <> ext
        p = At.Paths file_path thumbnail_path
        mime = getMimeType ext
        attachment = At.Attachment
            { At.mimetype = mime
            , At.creation_time = undefined -- overwritten by the caller (parseAttachments)
            , At.sha256_hash = undefined   -- computed later from the file contents
            , At.phash = Nothing
            , At.illegal = False
            , At.post_id = undefined       -- overwritten by the caller (parseAttachments)
            , At.resolution = undefined    -- overwritten by the caller (parseAttachments)
            , At.file_extension = Just $ T.drop 1 ext
            , At.thumb_extension = Just $ "png"
            , At.original_filename = Just $ filename <> ext
            , At.file_size_bytes = size
            , At.board_filename = tim
            , At.spoiler = spoiler > 0
            , At.attachment_idx = 1
            }
    return (p, attachment)
notDeleted :: (a, b, c, d, At.Paths, At.Attachment) -> Bool
notDeleted (_, _, _, _, p, _) = not $ "deleted" `isSuffixOf` (At.file_path p)
withPathPrefix :: Text -> FilePath
withPathPrefix = ((<>) $ backup_read_root settings) . unpack
parseAttachments
:: (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)
-> [(Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)]
parseAttachments (site, board, thread, p, q) = filter notDeleted $
case JSONPosts.files p of
Just files -> map
(\(i, x) ->
( site
, board
, thread
, q
, At.Paths (withPathPrefix $ JS.file_path x) (withPathPrefix $ JS.thumb_path x)
, fileToAttachment i q x
)
) (zip [1..] files)
Nothing ->
case parseLegacyPaths p of
Nothing -> []
Just (paths, a) ->
let
dim = (JSONPosts.w p) >>= \w -> ((JSONPosts.h p) >>= \h -> Just $ At.Dimension w h)
in
( site
, board
, thread
, q
, paths
, a
{ At.creation_time = Posts.creation_time q
, At.resolution = dim
, At.post_id = fromJust $ Posts.post_id q
}
) : []
-- | Insert a record into a multimap: prepend it to the list of records
-- already stored under the key derived from the record itself.
insertRecord
    :: Ord a
    => (b -> a)
    -> Map.Map a [b]
    -> b
    -> Map.Map a [b]
insertRecord getKey accMap x =
    -- insertWith prepends: [x] ++ existing == x : existing
    Map.insertWith (++) (getKey x) [ x ] accMap
createNewPosts
:: JSONSettings
-> [ (Threads.Thread, JSONPosts.Post, Client.PostId) ]
-> IO [ Posts.Post ]
createNewPosts settings tuples = do
existing_post_results <- Client.getPosts settings $ map (\(_, _, c) -> c) tuples
existing_posts <- either handleError return existing_post_results
thread_max_local_idx_result <- Client.getThreadMaxLocalIdx settings thread_ids
thread_max_local_idxs <- either handleError return thread_max_local_idx_result
let existing_set :: Set (Int64, Int64) = Set.fromList (map (\x -> (Posts.thread_id x, Posts.board_post_id x)) existing_posts)
let to_insert_list :: [ (Threads.Thread, JSONPosts.Post, Client.PostId) ] =
sortBy (comparing $ \(_, _, p) -> Client.board_post_id p) $
newPosts tuples existing_set
-- Map of thread_id to the largest local_idx value (which would be the number of the last post in the thread)
let local_idx :: Map.Map Int64 Int = Map.fromList thread_max_local_idxs
let insert_posts :: [ Posts.Post ] = fst $ foldl' foldFn ([], local_idx) to_insert_list
-- posts to insert are the posts that are not in existing_posts
-- so we create a Set (thread_id, board_post_id) ✓
-- then check every tuples against the set and the ones not in the set get added to a to_insert_list ✓
-- also for every tuples we need to compute a local_idx
-- so we create a Map index_map from thread_id to local_idx ✓
-- - for existing_posts
-- - need to compare posts already in the map with another post and keep the max local_idx ✓
-- to get the new local_idx, we must order the to_insert_list by board_post_id, and look up each entry ✓
print insert_posts
posts_result <- Client.postPosts settings insert_posts
new_posts <- either handleError return posts_result
return $ existing_posts ++ new_posts
where
handleError err = print err >> exitFailure
thread_ids :: [ Int64 ]
thread_ids = Set.elems $ Set.fromList $ map (\(t, _, _) -> Threads.thread_id t) tuples
newPosts :: [(Threads.Thread, JSONPosts.Post, Client.PostId)] -> Set (Int64, Int64) -> [(Threads.Thread, JSONPosts.Post, Client.PostId)]
newPosts ts existing_set = filter (\(_, _, c) -> Set.notMember (Client.thread_id c, Client.board_post_id c) existing_set) ts
foldFn
:: ([Posts.Post], Map.Map Int64 Int)
-> (Threads.Thread, JSONPosts.Post, Client.PostId)
-> ([Posts.Post], Map.Map Int64 Int)
foldFn (posts, idx_map) (t, p, c) =
case Map.lookup thread_id idx_map of
Nothing -> (post 1 : posts, Map.insert thread_id 1 idx_map)
Just i -> (post (i + 1) : posts, Map.insert thread_id (i + 1) idx_map)
where
post :: Int -> Posts.Post
post i = apiPostToArchivePost i t p
thread_id = Client.thread_id c
processBoard :: JSONSettings -> Sites.Site -> Boards.Board -> IO ()
processBoard settings site board = do
let catalogPath = backupDir </> "catalog.json"
putStrLn $ "catalog file path: " ++ catalogPath
result <- parseJSONCatalog catalogPath
case result of
Right catalogs -> do
let threads_on_board = concatMap ((maybe [] id) . threads) catalogs
all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board
all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board
let tuples :: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)] = concatMap
(\(t, posts) -> map (\p -> (site, board, t, p)) posts)
all_posts_on_board
posts_result :: [ Posts.Post ] <- createNewPosts settings (map (\(_, _, c, d) -> (c, d, apiPostToPostKey c d)) tuples)
putStrLn "Sum of post_ids:"
print $ sum $ map (fromJust . Posts.post_id) posts_result
putStrLn "Sum of board_post_ids:"
print $ sum $ map Posts.board_post_id posts_result
let perfect_post_pairs = addPostsToTuples tuples posts_result
processFiles settings perfect_post_pairs
Left errMsg ->
putStrLn $ "Failed to parse the JSON file in directory: "
++ (Boards.pathpart board) ++ ". Error: " ++ errMsg
where
backupDir :: FilePath
backupDir = backup_read_root settings </> (Boards.pathpart board)
-- | Top-level local backfill: ensure the configured site exists, fetch
-- its boards, create archive rows for boards found on disk but not in
-- the database, and process every board we actually have a dump for.
processBackupDirectory :: JSONSettings -> IO ()
processBackupDirectory settings = do
    putStrLn "JSON successfully read!"
    print settings -- print the decoded JSON settings
    site :: Sites.Site <- ensureSiteExists settings
    dirs <- listCatalogDirectories settings
    let dirsSet = Set.fromList dirs
    let site_id_ = Sites.site_id site
    boards_result <- Client.getSiteBoards settings site_id_
    putStrLn "Boards fetched!"

    case boards_result of
        Left err -> do
            putStrLn $ "Error fetching boards: " ++ show err
            exitFailure
        Right archived_boards -> do
            let boardnames = map Boards.pathpart archived_boards
            created_boards <- createArchivesForNewBoards settings dirsSet boardnames site_id_
            let boards :: [ Boards.Board ] = archived_boards ++ created_boards
            -- Only process boards whose dump directory is actually present.
            let boards_we_have_data_for = filter (\board -> Set.member (Boards.pathpart board) dirsSet) boards
            mapM_ (processBoard settings site) boards_we_have_data_for
-- TODO: detect saged threads by reading the bump time from the thread and comparing
-- that time to the timestamp of the most recent post. If the post is newer
-- - then the thread is being saged. Reasons it can be saged:
-- - it's saged by a mod
-- - the post has sage in the email field
-- - the thread is full.
--
-- Better to support all those flags via the api: saged, locked, cyclical?, sticky
-- - deleted could be there too
-- - edited could be there too
import Lib
main :: IO ()
main = do

16
src/BoardQueueElem.hs Normal file
View File

@ -0,0 +1,16 @@
module BoardQueueElem where
import Data.Time.Clock (UTCTime)
import SitesType (Site)
import BoardsType (Board)
-- | One entry of the board sync priority queue: a board (with its site)
-- tagged with the time it was last modified upstream.
data BoardQueueElem = BoardQueueElem
    { site :: Site
    , board :: Board
    , last_modified :: UTCTime
    } deriving (Show, Eq)

-- Intentionally inverted ordering: the most recently modified board is
-- the *smallest* element, so it sorts to the front of an ascending
-- container.
-- NOTE(review): this Ord is not consistent with the derived Eq (equal
-- timestamps but different boards compare "equal" for Ord, unequal for
-- Eq) — fine for sorting, but verify Set/Map usage tolerates it.
instance Ord BoardQueueElem where
    (<=) :: BoardQueueElem -> BoardQueueElem -> Bool
    a <= b = last_modified a >= last_modified b

View File

@ -12,5 +12,5 @@ data Board = Board
, name :: Maybe String
, pathpart :: String
, site_id :: Int
} deriving (Show, Generic, FromJSON)
} deriving (Show, Eq, Generic, FromJSON)

@ -1 +1 @@
Subproject commit 62a23581e786f8564653406845c4b2a07d73deb6
Subproject commit 88b5f0df7ea5e83a65a6c6153f197da7cd1c6217

View File

@ -32,7 +32,7 @@ data Thread = Thread
, locked :: Maybe Int
, cyclical :: Maybe J.Cyclical
, last_modified :: Int
, board :: Text
-- , board :: Text
, files :: Maybe [J.File]
, resto :: Int
, unique_ips :: Maybe Int

View File

@ -23,7 +23,7 @@ data Post = Post
, cyclical :: Maybe J.Cyclical
, last_modified :: Int
, embed :: Maybe Text
, board :: Text
-- , board :: Text
, files :: Maybe [J.File]
, resto :: Int
, unique_ips :: Maybe Int

777
src/Lib.hs Normal file
View File

@ -0,0 +1,777 @@
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Redundant bracket" #-}
{-# HLINT ignore "Use fromMaybe" #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
module Lib where
import System.Exit
import Data.Int (Int64)
import Control.Monad (filterM)
import System.Console.CmdArgs hiding (name)
import System.Directory
( listDirectory
, doesFileExist
, copyFile
, createDirectoryIfMissing
, removeFile
)
import System.FilePath ((</>), (<.>), takeExtension)
import Data.List (find, isSuffixOf, foldl', sortBy)
import Data.Ord (comparing)
import qualified Data.Set as Set
import Data.Set (Set)
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Data.Time.Clock (UTCTime)
import qualified Data.Map as Map
import Data.Maybe (fromJust, catMaybes)
import Data.Text (Text, unpack)
import qualified Data.Text as T
import Data.Text.Encoding (decodeUtf8)
import Network.Mime (defaultMimeLookup)
import PerceptualHash (fileHash)
import Control.Exception.Safe (tryAny, tryAsync, SomeException, displayException)
import qualified Data.ByteString.Lazy as B
import Data.Aeson (FromJSON)
import JSONParsing
import qualified JSONCommonTypes as JS
import qualified JSONPost as JSONPosts
import qualified Network.DataClient as Client
import qualified SitesType as Sites
import qualified BoardsType as Boards
import qualified ThreadType as Threads
import qualified Common.AttachmentType as At
import qualified Common.PostsType as Posts
import qualified Hash
import qualified Data.WordUtil as Words
import Common.Server.JSONSettings as J
import Common.Network.HttpClient (HttpError)
import qualified Common.Server.ConsumerSettings as CS
-- | Command-line arguments: the path of the JSON settings file to load.
newtype SettingsCLI = SettingsCLI
    { jsonFile :: FilePath
    } deriving (Show, Data, Typeable)
-- | Move a file by copying its bytes to the destination and then
-- deleting the original.  Unlike a plain rename this also works across
-- filesystem boundaries.
moveFile :: FilePath -> FilePath -> IO ()
moveFile src dst = do
    payload <- B.readFile src
    B.writeFile dst payload
    removeFile src
-- | List the subdirectories of the backup root that look like board
-- dumps, i.e. contain a @catalog.json@.  A few known aggregate
-- directories are skipped.
listCatalogDirectories :: JSONSettings -> IO [ FilePath ]
listCatalogDirectories settings =
    listDirectory root >>= filterM isBoardDump . filter wanted
    where
        root :: FilePath
        root = backup_read_root settings

        -- Overboard-style aggregate directories are not real boards.
        wanted :: FilePath -> Bool
        wanted d = d `notElem` ["sfw", "alt", "overboard"]

        -- A directory counts as a board dump when its catalog exists.
        isBoardDump :: FilePath -> IO Bool
        isBoardDump dir = doesFileExist (root </> dir </> "catalog.json")
-- | Find the configured site (matched by name) in the fetched site
-- list, creating it via the api when absent.  Exits the process if the
-- list could not be fetched or the creation fails.
ensureSiteExists :: JSONSettings -> Either HttpError [ Sites.Site ] -> IO Sites.Site
ensureSiteExists settings sitesResult = do
    case sitesResult of
        Right siteList ->
            case find (\site -> Sites.name site == site_name settings) siteList of
                Just site -> do
                    putStrLn $ site_name settings ++ " already exists!"
                    return site
                Nothing -> do
                    putStrLn $ site_name settings ++ " does not exist. Creating..."
                    postResult <- Client.postSite settings

                    case postResult of
                        Right (site:_) -> do
                            putStrLn $ "Successfully created " ++ site_name settings ++ ". " ++ show site
                            return site
                        Right [] -> do
                            -- Insert succeeded but postgrest returned no row.
                            putStrLn "Did not get new site id back from postgrest"
                            exitFailure
                        Left err -> do
                            putStrLn $ "Failed to create " ++ site_name settings
                                ++ " Error: " ++ show err
                            exitFailure
        Left err -> do
            putStrLn $ "Error fetching sites: " ++ show err
            exitFailure
-- | Create database rows for boards we have data for (@dirsSet@) that
-- are not yet archived (@archived_boards@).  Returns only the newly
-- created boards; exits the process on an api error.
createArchivesForNewBoards
    :: JSONSettings
    -> Set String
    -> [ String ]
    -> Int
    -> IO [ Boards.Board ]
createArchivesForNewBoards settings dirsSet archived_boards siteid = do
    let archivedBoardsSet = Set.fromList archived_boards

    -- Find boards that are in dirs but not in archived_boards
    let boardsToArchive = dirsSet `Set.difference` archivedBoardsSet

    putStrLn $ "Creating " ++ (show $ length boardsToArchive) ++ " boards:"
    mapM_ putStrLn boardsToArchive

    if Set.null boardsToArchive
        then return []
        else do
            post_result <- Client.postBoards settings (Set.toList boardsToArchive) siteid

            case post_result of
                Left err -> do
                    putStrLn $ "Error posting boards: " ++ show err
                    exitFailure
                Right boards -> do
                    putStrLn "Created the following boards:"
                    mapM_ (putStrLn . Boards.pathpart) boards
                    return boards
-- | Convert a thread from the site's JSON api into an archive thread
-- row for insertion.
-- NOTE(review): 'Threads.thread_id' is left 'undefined' — the real id
-- is assigned by the database on insert; forcing this field before
-- insertion would crash.
apiThreadToArchiveThread :: Int -> Thread -> Threads.Thread
apiThreadToArchiveThread board_id_ json_thread =
    Threads.Thread
        { Threads.thread_id = undefined
        , Threads.board_thread_id = no json_thread
        , Threads.creation_time = epochToUTCTime $ fromIntegral (time json_thread)
        , Threads.board_id = board_id_
        }
-- | Interpret an integral Unix timestamp (seconds since the epoch) as a
-- 'UTCTime'.
epochToUTCTime :: Int -> UTCTime
epochToUTCTime seconds = posixSecondsToUTCTime (realToFrac seconds)
-- | Insert threads that appear in the site's catalog but are not in the
-- archive yet.  Returns the newly inserted threads; exits the process
-- on an api error.
createArchivesForNewThreads
    :: JSONSettings
    -> [ Thread ]
    -> [ Threads.Thread ]
    -> Boards.Board
    -> IO [ Threads.Thread ]
createArchivesForNewThreads settings all_threads archived_threads board = do
    putStrLn $ "Creating " ++ show (length threads_to_create) ++ " threads."
    threads_result <- Client.postThreads settings (map (apiThreadToArchiveThread board_id) threads_to_create)

    case threads_result of
        Left err -> do
            putStrLn $ "Error creating threads: " ++ show err
            exitFailure
        Right new_threads -> return new_threads

    where
        board_id :: Int = Boards.board_id board

        -- Board-local thread numbers that already exist in the archive.
        archived_board_thread_ids :: Set.Set Int
        archived_board_thread_ids =
            Set.fromList $ map Threads.board_thread_id archived_threads

        threads_to_create :: [ Thread ]
        threads_to_create =
            filter
                ((`Set.notMember` archived_board_thread_ids) . no)
                all_threads
-- | Fetch the board's archived threads and create any of the given
-- catalog threads that are missing; returns existing and new threads
-- together.  Exits the process on an api error.
ensureThreads :: JSONSettings -> Boards.Board -> [ Thread ] -> IO [ Threads.Thread ]
ensureThreads settings board all_threads = do
    threads_result <- Client.getThreads settings (Boards.board_id board) (map no all_threads)

    case threads_result of
        Left err -> do
            putStrLn $ "Error fetching threads: " ++ show err
            exitFailure
        Right archived_threads -> do
            putStrLn $ show (length archived_threads) ++ " threads already exist."
            new_threads <- createArchivesForNewThreads settings all_threads archived_threads board
            return $ archived_threads ++ new_threads
-- | Load the posts of one thread via the configured 'getJSONPosts'
-- getter (local file or HTTP).  A parse failure is logged and yields an
-- empty post list instead of aborting the whole board.
readPosts
    :: FileGetters
    -> Sites.Site
    -> Boards.Board
    -> Threads.Thread
    -> IO (Threads.Thread, [ JSONPosts.Post ])
readPosts FileGetters {..} site board thread = do
    result <- getJSONPosts site relative_path

    case result of
        Left err -> do
            putStrLn $ "Failed to parse the JSON file " ++ relative_path ++ " error: " ++ err
            putStrLn $ "Site: " ++ show site
            return (thread, [])
        Right posts_wrapper -> return (thread, JSONPosts.posts posts_wrapper)

    where
        -- vichan-style path: <board>/res/<thread number>.json
        relative_path :: FilePath
        relative_path = Boards.pathpart board </> "res" </> (show (Threads.board_thread_id thread) ++ ".json")
-- | Build the (thread id, board-local post number) key used to look a
-- post up in the database.
apiPostToPostKey :: Threads.Thread -> JSONPosts.Post -> Client.PostId
apiPostToPostKey thread post = Client.PostId
    { Client.thread_id = Threads.thread_id thread
    , Client.board_post_id = JSONPosts.no post
    }
-- Convert Post to DbPost
-- | Map a JSON api post onto an archive post row.  post_id stays
-- Nothing until the row is inserted; local_idx is the post's 1-based
-- position within its thread.
apiPostToArchivePost :: Int -> Threads.Thread -> JSONPosts.Post -> Posts.Post
apiPostToArchivePost local_idx thread post =
    Posts.Post
        { Posts.post_id = Nothing
        , Posts.board_post_id = JSONPosts.no post
        , Posts.creation_time = posixSecondsToUTCTime (realToFrac $ JSONPosts.time post)
        , Posts.body = JSONPosts.com post
        , Posts.name = JSONPosts.name post
        , Posts.subject = JSONPosts.sub post
        , Posts.email = JSONPosts.email post
        , Posts.thread_id = Threads.thread_id thread
        , Posts.embed = JSONPosts.embed post
        , Posts.local_idx = local_idx
        }
-- | A version of 'concatMap' whose function runs in a monad.
-- Stolen from package extra Control.Monad.Extra
concatMapM :: Monad m => (a -> m [b]) -> [a] -> m [b]
{-# INLINE concatMapM #-}
concatMapM op = foldr step (pure [])
    where
        step x acc = do
            ys <- op x
            -- Skip the append entirely when this element contributed
            -- nothing.
            if null ys
                then acc
                else fmap (ys ++) acc
-- | Join freshly inserted posts back onto the (site, board, thread,
-- json post) tuples they came from, keyed by (thread id, board post id).
-- NOTE(review): uses the partial 'Map.!' — every element of @posts@ is
-- assumed to originate from @tuples@; a post with no matching tuple
-- would crash here.
addPostsToTuples
    :: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)]
    -> [ Posts.Post ]
    -> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)]
addPostsToTuples tuples posts = map f posts

    where
        post_map :: Map.Map (Int64, Int64) (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)
        post_map = Map.fromList (map (\(a, b, c, d) -> ((Threads.thread_id c, JSONPosts.no d), (a, b, c, d))) tuples)

        f :: Posts.Post -> (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)
        f new_post =
            (\(a, b, c, d) -> (a, b, c, d, new_post))
            (post_map Map.! (Posts.thread_id new_post, Posts.board_post_id new_post))
-- | Build an attachment row from a modern-format file entry of a post.
-- 'At.sha256_hash' stays 'undefined' until computeAttachmentHash fills
-- it in after the file is downloaded/located.
fileToAttachment :: Int -> Posts.Post -> JS.File -> At.Attachment
fileToAttachment i post file =
    At.Attachment
        { At.mimetype = maybe guessed_mime id (JS.mime file)
        , At.creation_time = Posts.creation_time post
        , At.sha256_hash = undefined
        , At.phash = Nothing
        , At.illegal = False
        , At.post_id = fromJust $ Posts.post_id post
        , At.resolution = dim
        , At.file_extension = Just extension
        , At.thumb_extension = Just thumb_extension
        , At.original_filename = Just $ JS.filename file <> "." <> extension
        , At.file_size_bytes = JS.fsize file
        , At.board_filename = JS.id file
        , At.spoiler = maybe False id $ JS.spoiler file
        , At.attachment_idx = i
        }

    where
        -- The api reports the extension with a leading dot; strip it.
        extension = T.filter (/= '.') $ JS.ext file

        thumb_extension = T.pack $ drop 1 $ takeExtension $ unpack $ JS.thumb_path file

        -- Fall back to guessing the mime type from the extension.
        guessed_mime = getMimeType extension

        -- A resolution needs both width and height.
        dim = (JS.w file) >>= \w ->
            ((JS.h file) >>= \h ->
                Just $ At.Dimension w h)
-- | Guess a mime type from a bare file extension (no leading dot).
getMimeType :: Text -> Text
getMimeType = decodeUtf8 . defaultMimeLookup
-- | Mime types for which a perceptual hash is attempted (raster images).
phash_mimetypes :: Set.Set Text
phash_mimetypes = Set.fromList
    [ "image/jpeg"
    , "image/png"
    , "image/gif"
    ]
-- | Place one attachment (and its optional thumbnail) into the media
-- directory using the strategy carried by 'FileGetters' (copy for local
-- backfill, move for downloaded temp files).
copyOrMoveFiles :: JSONSettings -> FileGetters -> Details -> IO ()
copyOrMoveFiles settings fgs (site, board, thread, _, path, attachment) = do
    (copyOrMove fgs) common_dest (src, dest) (thumb_src, thumb_dest)

    -- src = (At.file_path | At.thumb_path)
    -- dest = <media_root>/<website_name>/<boardpart>/<board_thread_id>/<sha>.<ext>
    where
        src :: FilePath
        src = At.file_path path

        thumb_src :: Maybe FilePath
        thumb_src = At.thumbnail_path path

        dest :: FilePath
        dest = common_dest
            </> (unpack $ At.board_filename attachment)
            <.> (unpack $ fromJust $ At.file_extension attachment)

        thumb_dest :: FilePath
        thumb_dest = common_dest
            </> "thumbnail_" <> (unpack $ At.board_filename attachment)
            <.> (unpack $ fromJust $ At.thumb_extension attachment)

        common_dest :: FilePath
        common_dest
            = (media_root_path settings)
            </> Sites.name site
            </> Boards.pathpart board
            </> (show $ Threads.board_thread_id thread)
-- | Everything needed to archive one attachment: its origin (site,
-- board, thread, post), source paths, and the attachment row itself.
type Details = (Sites.Site, Boards.Board, Threads.Thread, Posts.Post, At.Paths, At.Attachment)

-- | Persist the attachments of the given posts: figure out which
-- attachments are missing from the database, resolve/download their
-- files, hash them (sha256 + optional perceptual hash), insert the
-- rows, then copy or move the files into the media directory.
processFiles
    :: JSONSettings
    -> FileGetters
    -> [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)]
    -> IO ()
processFiles settings fgs tuples = do -- perfect just means that our posts have ids, they're already inserted into the db
    let ps = map (\(_, _, _, _, x) -> x) tuples

    existing_attachments_result <- Client.getAttachments settings (map (fromJust . Posts.post_id) ps)

    case existing_attachments_result of
        Left err -> do
            putStrLn $ "Error fetching attachments: " ++ show err
            exitFailure
        Right existing_attachments -> do
            -- Attachments already in the database, bucketed by
            -- (post id, board filename).
            let map_existing :: Map.Map (Int64, Text) [ At.Attachment ] =
                    foldl'
                        (insertRecord (\a -> (At.post_id a, At.board_filename a)))
                        Map.empty
                        existing_attachments

            let attachments_on_board :: [ Details ] =
                    concatMap parseAttachments tuples
            -- attachments_on_board are the only files that can be copied into the archive dir right now
            -- since that's where we have the src filename. except here the Attachment doesn't have a sha hash yet
            -- so we can't build the destination filename.

            let map_should_exist :: Map.Map (Int64, Text) [ Details ] =
                    foldl'
                        (insertRecord (\(_, _, _, _, _, a) -> (At.post_id a, At.board_filename a)))
                        Map.empty
                        attachments_on_board

            -- Keep only the keys the database does not know about yet.
            let to_insert_map =
                    Map.filterWithKey
                        (\k _ -> not $ k `Map.member` map_existing)
                        map_should_exist

            let to_insert = foldr (++) [] $ Map.elems to_insert_map

            -- Resolve source files (may download); drop unresolvable ones.
            to_insert_ <- mapM ensureAttachmentExists to_insert
            let to_insert_exist = catMaybes to_insert_

            with_hashes <- mapM computeAttachmentHash to_insert_exist

            attachments_result <- Client.postAttachments settings with_hashes

            case attachments_result of
                Left err -> do
                    putStrLn $ "Error posting attachments: " ++ show err
                    exitFailure
                Right saved -> do
                    putStrLn $ "Saved " ++ (show $ length saved) ++ " attachments!"
                    mapM_ (copyOrMoveFiles settings fgs) to_insert_exist

    where
        -- Resolve the attachment's source paths via the FileGetters;
        -- Nothing when the source file cannot be found/downloaded.
        ensureAttachmentExists :: Details -> IO (Maybe Details)
        ensureAttachmentExists (a, b, c, d, p, f) =
            (attachmentPaths fgs) p >>=
                return . (maybe Nothing (\x -> Just (a, b, c, d, x, f)))

        -- Fill in sha256_hash and (for raster images) phash on the
        -- attachment row.  phash failures are logged, never fatal.
        computeAttachmentHash :: Details -> IO At.Attachment
        computeAttachmentHash (_, _, _, _, p, q) = do
            let f = At.file_path p

            putStrLn $ "Reading " ++ f
            -- putStrLn $ "Attachment: " ++ (show q)
            sha256_sum <- Hash.computeSHA256 f
            putStrLn $ "SHA-256: " ++ unpack sha256_sum

            phash :: Maybe Int64 <-
                case (At.mimetype q) `Set.member` phash_mimetypes of
                    True -> do
                        putStrLn $ "Running tryAny $ fileHash f " ++ f
                        either_exception <- tryAny $ fileHash f
                        putStrLn $ "Done tryAny $ fileHash f " ++ f
                        case either_exception of
                            Left (err :: SomeException) -> do
                                putStrLn $ "Error while computing the perceptual hash of file " ++ f ++ " " ++ displayException err
                                return Nothing
                            Right either_phash ->
                                case either_phash of
                                    Left err_str -> do
                                        putStrLn $ "Failed to compute phash for file " ++ (unpack sha256_sum) ++ " " ++ f ++ " " ++ err_str
                                        return Nothing
                                    Right phash_w -> do
                                        result <- tryAsync $ do
                                            -- A phash of 0 is treated as "no hash"
                                            -- (degenerate/blank image).
                                            let phash_i = Words.wordToSignedInt64 phash_w
                                            if phash_i == 0 then do
                                                putStrLn $ "phash is 0 for file " ++ (unpack sha256_sum) ++ " " ++ f
                                                return Nothing
                                            else do
                                                putStrLn $ "phash: " ++ show phash_w
                                                return $ Just $ Words.wordToSignedInt64 phash_w
                                        case result of
                                            Left (err2 :: SomeException) -> do
                                                putStrLn $ "Error handling phash result! " ++ displayException err2
                                                return Nothing
                                            Right w -> return w

                    False -> return Nothing

            return q
                { At.sha256_hash = sha256_sum
                , At.phash = phash
                }

        -- Reconstruct src/thumb paths for legacy-format posts that carry
        -- file metadata inline (tim/ext/filename) instead of a files list.
        parseLegacyPaths :: Boards.Board -> JSONPosts.Post -> Maybe (At.Paths, At.Attachment)
        parseLegacyPaths board post = do
            tim <- JSONPosts.tim post
            ext <- JSONPosts.ext post
            filename <- JSONPosts.filename post
            size <- JSONPosts.fsize post
            -- NOTE(review): this reads fsize again, so `spoiler` is the
            -- file size and `spoiler > 0` below is true for any non-empty
            -- file.  Looks like it should read a spoiler field — verify
            -- against JSONPosts.Post before changing.
            spoiler <- JSONPosts.fsize post
            let
                board_pathpart = T.pack $ Boards.pathpart board
                file_path = (withPathPrefix "") </> (T.unpack $ board_pathpart <> "/src/" <> tim <> ext)
                thumb_extension = "png"
                thumbnail_path = (withPathPrefix "") </> (T.unpack $ board_pathpart <> "/thumb/" <> tim <> "." <> thumb_extension)
                p = At.Paths file_path (Just thumbnail_path)
                mime = getMimeType ext

                attachment = At.Attachment
                    { At.mimetype = mime
                    , At.creation_time = undefined
                    , At.sha256_hash = undefined
                    , At.phash = Nothing
                    , At.illegal = False
                    , At.post_id = undefined
                    , At.resolution = undefined
                    , At.file_extension = Just $ T.drop 1 ext
                    , At.thumb_extension = Just $ thumb_extension
                    , At.original_filename = Just $ filename <> ext
                    , At.file_size_bytes = size
                    , At.board_filename = tim
                    , At.spoiler = spoiler > 0
                    , At.attachment_idx = 1
                    }

            return (p, attachment)

        -- Posts whose file was deleted keep a path ending in "deleted".
        notDeleted :: (a, b, c, d, At.Paths, At.Attachment) -> Bool
        notDeleted (_, _, _, _, p, _) = not $ "deleted" `isSuffixOf` (At.file_path p)

        -- Prefix a site-relative path with the configured root/url.
        withPathPrefix :: Text -> FilePath
        withPathPrefix = (addPathPrefix fgs) . unpack

        -- Expand one post into the attachments it references (modern
        -- files list or legacy inline metadata), dropping deleted files.
        parseAttachments
            :: (Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post, Posts.Post)
            -> [ Details ]
        parseAttachments (site, board, thread, p, q) = filter notDeleted $
            case JSONPosts.files p of
                Just files -> map
                    (\(i, x) ->
                        ( site
                        , board
                        , thread
                        , q
                        , At.Paths (withPathPrefix $ JS.file_path x) (Just $ withPathPrefix $ JS.thumb_path x)
                        , fileToAttachment i q x
                        )
                    ) (zip [1..] files)
                Nothing ->
                    case parseLegacyPaths board p of
                        Nothing -> []
                        Just (paths, a) ->
                            let
                                dim = (JSONPosts.w p) >>= \w -> ((JSONPosts.h p) >>= \h -> Just $ At.Dimension w h)
                            in
                                ( site
                                , board
                                , thread
                                , q
                                , paths
                                , a
                                    { At.creation_time = Posts.creation_time q
                                    , At.resolution = dim
                                    , At.post_id = fromJust $ Posts.post_id q
                                    }
                                ) : []
-- | Prepend a record onto the bucket selected by its key, creating the
-- bucket when it does not exist yet.
insertRecord
    :: Ord a
    => (b -> a)
    -> Map.Map a [b]
    -> b
    -> Map.Map a [b]
insertRecord getKey accMap x =
    -- insertWith (++) [x] old == x : old, and [x] when the key is new.
    Map.insertWith (++) (getKey x) [x] accMap
-- | Insert the posts that are not yet in the database and return the
-- full set (existing ++ newly inserted).  Each new post gets a
-- local_idx continuing from its thread's current maximum index.
createNewPosts
    :: JSONSettings
    -> [ (Threads.Thread, JSONPosts.Post, Client.PostId) ]
    -> IO [ Posts.Post ]
createNewPosts settings tuples = do
    existing_post_results <- Client.getPosts settings $ map (\(_, _, c) -> c) tuples
    existing_posts <- either handleError return existing_post_results

    thread_max_local_idx_result <- Client.getThreadMaxLocalIdx settings thread_ids
    thread_max_local_idxs <- either handleError return thread_max_local_idx_result

    let existing_set :: Set (Int64, Int64) = Set.fromList (map (\x -> (Posts.thread_id x, Posts.board_post_id x)) existing_posts)

    -- Sort by board post number so local_idx follows posting order.
    let to_insert_list :: [ (Threads.Thread, JSONPosts.Post, Client.PostId) ] =
            sortBy (comparing $ \(_, _, p) -> Client.board_post_id p) $
                newPosts tuples existing_set

    -- Map of thread_id to the largest local_idx value (which would be the number of the last post in the thread)
    let local_idx :: Map.Map Int64 Int = Map.fromList thread_max_local_idxs

    let insert_posts :: [ Posts.Post ] = fst $ foldl' foldFn ([], local_idx) to_insert_list

    -- posts to insert are the posts that are not in existing_posts
    --      so we create a Set (thread_id, board_post_id) ✓
    --      then check every tuples against the set and the ones not in the set get added to a to_insert_list ✓
    -- also for every tuples we need to compute a local_idx
    --      so we create a Map index_map from thread_id to local_idx ✓
    --          - for existing_posts
    --          - need to compare posts already in the map with another post and keep the max local_idx ✓
    --      to get the new local_idx, we must order the to_insert_list by board_post_id, and look up each entry ✓

    print insert_posts

    posts_result <- Client.postPosts settings insert_posts
    new_posts <- either handleError return posts_result
    return $ existing_posts ++ new_posts

    where
        handleError err = print err >> exitFailure

        -- Distinct thread ids touched by this batch.
        thread_ids :: [ Int64 ]
        thread_ids = Set.elems $ Set.fromList $ map (\(t, _, _) -> Threads.thread_id t) tuples

        -- Keep only tuples whose (thread, post number) key is unknown.
        newPosts :: [(Threads.Thread, JSONPosts.Post, Client.PostId)] -> Set (Int64, Int64) -> [(Threads.Thread, JSONPosts.Post, Client.PostId)]
        newPosts ts existing_set = filter (\(_, _, c) -> Set.notMember (Client.thread_id c, Client.board_post_id c) existing_set) ts

        -- Accumulate posts while threading each thread's running
        -- local_idx counter through the map.
        foldFn
            :: ([Posts.Post], Map.Map Int64 Int)
            -> (Threads.Thread, JSONPosts.Post, Client.PostId)
            -> ([Posts.Post], Map.Map Int64 Int)
        foldFn (posts, idx_map) (t, p, c) =
            case Map.lookup thread_id idx_map of
                Nothing -> (post 1 : posts, Map.insert thread_id 1 idx_map)
                Just i -> (post (i + 1) : posts, Map.insert thread_id (i + 1) idx_map)

            where
                post :: Int -> Posts.Post
                post i = apiPostToArchivePost i t p

                thread_id = Client.thread_id c
-- | Abstraction over where the source data lives: a local backup
-- directory ('localFileGetters') or the live site over HTTP
-- ('httpFileGetters').
data FileGetters = FileGetters
    { getJSONCatalog :: Sites.Site -> String -> IO (Either String [ Catalog ])
      -- ^ fetch and parse a board's catalog
    , getJSONPosts :: Sites.Site -> String -> IO (Either String JSONPosts.PostWrapper)
      -- ^ fetch and parse one thread's posts
    , addPathPrefix :: String -> String
      -- ^ turn a site-relative attachment path into a full path or url
    , attachmentPaths :: At.Paths -> IO (Maybe At.Paths)
      -- ^ resolve attachment paths to readable local files (may download)
    , copyOrMove :: String -> (String, String) -> (Maybe String, String) -> IO ()
      -- ^ place (src, dest) and (thumb src, thumb dest) under a common dir
    }
-- | 'FileGetters' for a local backup directory: JSON is parsed from
-- files under the backup root and attachments are copied (never moved).
localFileGetters :: JSONSettings -> FileGetters
localFileGetters settings = FileGetters
    { getJSONCatalog = const $ parseJSONCatalog . withRoot
    , getJSONPosts = const $ parsePosts . withRoot
    , addPathPrefix = ((++) $ backup_read_root settings)
    , attachmentPaths = \p -> do
        exists <- doesFileExist (At.file_path p)
        if exists then return (Just p) else return Nothing
    , copyOrMove = \common_dest (src, dest) (m_thumb_src, thumb_dest) -> do
        -- Never overwrite a file that is already archived.
        destination_exists <- doesFileExist dest

        if not destination_exists
            then do
                src_exists <- doesFileExist src

                createDirectoryIfMissing True common_dest

                if src_exists
                    then putStrLn ("Copying " ++ src) >> copyFile src dest
                    else return ()

                -- The thumbnail is optional and copied independently.
                case m_thumb_src of
                    Nothing -> return ()
                    Just thumb_src -> do
                        thumb_exists <- doesFileExist thumb_src

                        if thumb_exists
                            then putStrLn ("Copying " ++ thumb_src) >> copyFile thumb_src thumb_dest
                            else return ()
            else return ()
    }

    where
        withRoot = (backup_read_root settings </>)
-- | Archive one board using the supplied 'FileGetters': parse its
-- catalog, ensure its threads and posts exist in the database, then
-- archive the attachment files.
processBoard :: JSONSettings -> FileGetters -> Sites.Site -> Boards.Board -> IO ()
processBoard settings fgs@FileGetters {..} site board = do
    let catalogPath = Boards.pathpart board </> "catalog.json"
    putStrLn $ "catalog file path: " ++ catalogPath

    result <- getJSONCatalog site catalogPath

    case result of
        Right (catalogs :: [ Catalog ]) -> do
            -- A catalog page can have no threads (Nothing) — treat as [].
            let threads_on_board = concatMap ((maybe [] id) . threads) catalogs

            all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board

            all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts fgs site board) all_threads_for_board

            -- Flatten to one tuple per post, carrying its site/board/thread.
            let tuples :: [(Sites.Site, Boards.Board, Threads.Thread, JSONPosts.Post)] = concatMap
                    (\(t, posts) -> map (\p -> (site, board, t, p)) posts)
                    all_posts_on_board

            posts_result :: [ Posts.Post ] <- createNewPosts settings (map (\(_, _, c, d) -> (c, d, apiPostToPostKey c d)) tuples)

            -- Diagnostic checksums over the known/inserted posts.
            putStrLn "Sum of post_ids:"
            print $ sum $ map (fromJust . Posts.post_id) posts_result
            putStrLn "Sum of board_post_ids:"
            print $ sum $ map Posts.board_post_id posts_result

            let perfect_post_pairs = addPostsToTuples tuples posts_result

            processFiles settings fgs perfect_post_pairs
        Left errMsg ->
            putStrLn $ "Failed to parse the JSON file in directory: "
                ++ (Boards.pathpart board) ++ ". Error: " ++ errMsg
-- | Ensure the configured site and the named boards exist in the
-- database.  Returns the site plus only the boards we actually have
-- data for; exits the process on an api error.
getBoards :: JSONSettings -> [ FilePath ] -> IO (Sites.Site, [ Boards.Board ])
getBoards settings board_names = do
    sitesResult <- Client.getAllSites settings
    site :: Sites.Site <- ensureSiteExists settings sitesResult
    let boardsSet = Set.fromList board_names
    let site_id_ = Sites.site_id site
    boards_result <- Client.getSiteBoards settings site_id_
    putStrLn "Boards fetched!"

    case boards_result of
        Left err -> do
            putStrLn $ "Error fetching boards: " ++ show err
            exitFailure
        Right archived_boards -> do
            let boardnames = map Boards.pathpart archived_boards
            created_boards <- createArchivesForNewBoards settings boardsSet boardnames site_id_
            let boards :: [ Boards.Board ] = archived_boards ++ created_boards
            -- Filter out archived boards that were not requested.
            let boards_we_have_data_for = filter (\board -> Set.member (Boards.pathpart board) boardsSet) boards
            return (site, boards_we_have_data_for)
-- | Resolve the given board names against the database and archive each
-- board we actually have data for.
processBoards :: JSONSettings -> FileGetters -> [ FilePath ] -> IO ()
processBoards settings fgs board_names = do
    (site, boards) <- getBoards settings board_names
    mapM_ (processBoard settings fgs site) boards
-- | Entry point for archiving from a local backup directory tree.
processBackupDirectory :: JSONSettings -> IO ()
processBackupDirectory settings = do
    putStrLn "JSON successfully read!"
    print settings -- print the decoded JSON settings
    listCatalogDirectories settings
        >>= processBoards settings (localFileGetters settings)
-- | Derive per-site client settings from the consumer configuration.
-- NOTE(review): 'backup_read_root' is 'undefined' — the HTTP code path
-- never reads local backups, but forcing that field would crash;
-- confirm no caller of these settings touches it.
toClientSettings :: CS.ConsumerJSONSettings -> CS.JSONSiteSettings -> J.JSONSettings
toClientSettings CS.ConsumerJSONSettings {..} CS.JSONSiteSettings {..} =
    J.JSONSettings
        { J.postgrest_url = postgrest_url
        , J.jwt = jwt
        , J.backup_read_root = undefined
        , J.media_root_path = media_root_path
        , J.site_name = name
        , J.site_url = root_url
        }
-- | GET a JSON document at a path relative to the site's base url,
-- rendering any transport error to a String.
httpGetJSON :: (FromJSON a) => Sites.Site -> String -> IO (Either String a)
httpGetJSON site path =
    fmap toErrMsg (Client.getJSON (Sites.url site </> path))
    where
        toErrMsg :: Either Client.HttpError a -> Either String a
        toErrMsg = either (Left . show) Right
-- | 'FileGetters' for pulling data from a live site over HTTP: JSON is
-- fetched from the site's url and attachments are downloaded to temp
-- files which are later moved into the media directory.
httpFileGetters :: J.JSONSettings -> FileGetters
httpFileGetters settings = FileGetters
    { getJSONCatalog = httpGetJSON
    , getJSONPosts = httpGetJSON
    , addPathPrefix = ((++) $ J.site_url settings)
    -- attachmentPaths here actually doesn't get the paths of the attachment,
    -- it downloads them into a temporary file and gets that path of that.
    , attachmentPaths = \paths -> do
        filepath <- Client.getFile (At.file_path paths)

        m_thumbpath <- case At.thumbnail_path paths of
            Nothing -> return Nothing
            Just thumbpath -> Client.getFile thumbpath

        -- If the main file failed to download, the whole attachment is
        -- skipped; a failed thumbnail just yields Paths with no thumb.
        return $ filepath >>= \fp ->
            case m_thumbpath of
                Nothing -> return (At.Paths fp Nothing)
                tp -> return (At.Paths fp tp)
    , copyOrMove = \common_dest (src, dest) (m_thumb_src, thumb_dest) -> do
        putStrLn $ "Copy Or Move (Move) src: " ++ src ++ " dest: " ++ dest
        createDirectoryIfMissing True common_dest
        -- Downloaded temp files are moved, not copied.
        moveFile src dest

        case m_thumb_src of
            Nothing -> return ()
            Just thumb_src -> moveFile thumb_src thumb_dest
    }

69
src/Main.hs Normal file
View File

@ -0,0 +1,69 @@
{-# LANGUAGE RecordWildCards #-}
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use when" #-}
module Main (main) where
import System.Exit (exitFailure)
import qualified Data.ByteString.Lazy as B
import System.Console.CmdArgs (cmdArgs, Data, Typeable)
import Data.Aeson (decode)
import Control.Concurrent.Async (mapConcurrently_)
import Common.Server.ConsumerSettings
import Lib
( processBoards
, toClientSettings
, httpFileGetters
)
import Sync
-- | Command-line arguments: the path of the consumer settings JSON file.
newtype CliArgs = CliArgs
    { settingsFile :: String
    } deriving (Show, Data, Typeable)
-- | Read and decode the consumer settings file named on the command
-- line (defaulting to @consumer_settings.json@).  Exits the process on
-- an empty path or invalid JSON.
getSettings :: IO ConsumerJSONSettings
getSettings = do
    cliArgs <- cmdArgs $ CliArgs "consumer_settings.json"

    let filePath = settingsFile cliArgs
    if null filePath
        then do
            putStrLn "Error: No JSON settings file provided."
            exitFailure
        else do
            putStrLn $ "Loading settings from: " ++ filePath
            content <- B.readFile filePath
            case decode content :: Maybe ConsumerJSONSettings of
                Nothing -> do
                    putStrLn "Error: Invalid JSON format."
                    exitFailure
                Just settings -> return settings
-- | Backfill one configured website over HTTP: derive its per-site
-- client settings and archive every board listed in its configuration.
processWebsite :: ConsumerJSONSettings -> JSONSiteSettings -> IO ()
processWebsite settings site_settings =
    -- processBoards already has type IO (), so the previous trailing
    -- `return ()` was redundant and has been dropped.
    processBoards client_settings (httpFileGetters client_settings) (boards site_settings)
    where
        client_settings = toClientSettings settings site_settings
-- | Load settings, then optionally run a one-shot concurrent backfill
-- of every configured website and/or the continuous sync loop.
-- (The `if ... else return ()` shape is kept deliberately; "Use when"
-- is hlint-suppressed at the top of this module.)
main :: IO ()
main = do
    settings <- getSettings
    print settings

    _ <- if http_fill_all settings
        then do
            putStrLn "Starting web backfill"
            -- One worker per website, all running concurrently.
            mapConcurrently_ (processWebsite settings) (websites settings)
            putStrLn "Finished web backfill"
        else return ()

    if http_sync_continously settings
        then do
            putStrLn "Starting web sync loop"
            syncWebsites settings
        else return ()

    putStrLn "Done. Quitting."

View File

@ -1,5 +1,7 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use <&>" #-}
module Network.DataClient
( HttpError(..)
@ -17,6 +19,9 @@ module Network.DataClient
, postPosts
, getAttachments
, postAttachments
, getJSON
, getFile
, getLatestPostsPerBoard
) where
import Control.Monad (forM)
@ -36,6 +41,8 @@ import Data.Aeson
, Value
)
import GHC.Generics
import System.IO.Temp (openBinaryTempFile, getCanonicalTemporaryDirectory)
import System.IO (hClose)
import qualified Common.Server.JSONSettings as T
import qualified SitesType as Sites
@ -45,6 +52,7 @@ import qualified Common.AttachmentType as Attachments
import qualified Common.PostsType as Posts
import Common.Network.HttpClient
import qualified Network.DataClientTypes as T
import qualified Network.GetLatestPostsPerBoardResponse as GLPPBR
data PostId = PostId
@ -214,4 +222,34 @@ eitherDecodeResponse (Left err) = Left err
eitherDecodeResponse (Right bs) =
case eitherDecode bs of
Right val -> Right val
Left err -> Left $ StatusCodeError 500 $ LC8.pack $ "Failed to decode JSON: " ++ err ++ " " ++ (show bs)
Left err -> Left $ StatusCodeError 500 $ LC8.pack $ "Failed to decode JSON: " ++ err ++ " " ++ show bs
-- | GET a url and decode the response body as JSON.
getJSON :: (FromJSON a) => String -> IO (Either HttpError a)
-- `m >>= return . f` is just `f <$> m` (hlint: "Use fmap").
getJSON url = eitherDecodeResponse <$> get_ url []
-- | Download a url into a freshly created temp file and return the temp
-- file's path, or Nothing (with logging) on any HTTP error.
-- NOTE(review): the temp file is not removed here — callers are
-- expected to move it into place; verify nothing leaks when a later
-- step fails.
getFile :: String -> IO (Maybe String)
getFile url = do
    putStrLn $ "getFile " ++ url
    result <- get_ url []

    case result of
        Left (err :: HttpError) -> do
            putStrLn $ "getFile " ++ url ++ " Error!"
            print err
            return Nothing
        Right lbs -> do
            putStrLn $ "getFile " ++ url ++ " SUCCESS!"
            tmp_root <- getCanonicalTemporaryDirectory
            (tmp_filepath, tmp_filehandle) <- openBinaryTempFile tmp_root "chan.attachment"
            putStrLn $ "Created " ++ tmp_filepath

            putStrLn "Writing attachment..."
            LBS.hPut tmp_filehandle lbs
            -- Close promptly so the file can be read/moved right away.
            hClose tmp_filehandle
            return $ Just tmp_filepath
-- | Call the @get_latest_posts_per_board@ stored procedure, returning
-- one row per board describing its most recent post.
getLatestPostsPerBoard :: T.JSONSettings -> IO (Either HttpError [ GLPPBR.GetLatestPostsPerBoardResponse ])
-- `m >>= return . f` is just `f <$> m` (hlint: "Use fmap").
getLatestPostsPerBoard settings =
    eitherDecodeResponse <$> post settings "/rpc/get_latest_posts_per_board" mempty False

View File

@ -10,4 +10,3 @@ data ThreadMaxIdx = ThreadMaxIdx
{ thread_id :: Int64
, max_idx :: Int
} deriving (Show, Generic, FromJSON)

View File

@ -0,0 +1,20 @@
{-# LANGUAGE DeriveAnyClass #-}
module Network.GetLatestPostsPerBoardResponse
where
import Data.Int (Int64)
import Data.Time.Clock (UTCTime)
import Data.Aeson (FromJSON)
import GHC.Generics
-- | One row of the @get_latest_posts_per_board@ stored procedure: the
-- most recent post of each board joined with its board/site/thread ids.
data GetLatestPostsPerBoardResponse = GetLatestPostsPerBoardResponse
    { board_id :: Int
    , site_id :: Int
    , pathpart :: String
    , post_id :: Maybe Int64
      -- ^ Nothing when the board has no posts yet — presumably; verify
      -- against the SQL function.
    , board_post_id :: Int64
    , creation_time :: UTCTime
    , thread_id :: Int64
    , board_thread_id :: Integer
      -- ^ NOTE(review): Integer here vs Int64 elsewhere — confirm intended.
    } deriving (Show, Generic, FromJSON)

101
src/PriorityQueue.hs Normal file
View File

@ -0,0 +1,101 @@
{-
The purpose of this module is to provide a way to store things
in order (for example as most recent), take the nth item in
the order out, and put a new item in. In O(log n) time.
There is also a function to choose a random integer, but favours lower ones.
Together, this lets us process data in a safe way asynchronously,
where if we keep choosing a skewed random item to process all of the items
eventually get processed. No two threads will process the same thing at the
same time, and more popular things get processed more often.
-}
module PriorityQueue
( Elem (..)
, Queue
, take
, put
, selectSkewedIndex
, main
)
where
import Prelude hiding (splitAt, take, min, max, elem)
import Data.Set hiding (take, foldr, map)
import Data.Ord (comparing)
import System.Random (StdGen, getStdGen, randomR)
import Data.List (sort, group)
-- | A queue element: a payload tagged with an integer priority.
data Elem a = Elem
    { priority :: Int
    , element :: a
    }

-- Elements are ordered by priority alone; the payload is ignored.
instance Ord (Elem a) where
    compare = comparing priority

-- Equality also ignores the payload, so a Set-based queue keeps at most
-- one element per priority value.
instance Eq (Elem a) where
    (==) x y = priority x == priority y

-- | The queue is simply an ordered set of its elements.
type Queue a = Set a
-- | Remove and return the element n places below the maximum (n == 0
-- yields the greatest element).  Partial: requires 0 <= n < size queue.
take :: (Ord a) => Int -> Queue a -> (a, Queue a)
take n queue =
    -- Ascending index size-1 is the maximum, so size-n-1 is the n-th
    -- element counted down from the top.
    let picked = elemAt (size queue - n - 1) queue
    in (picked, delete picked queue)
-- | Insert an element into the queue (alias for Set 'insert').
put :: (Ord a) => a -> Queue a -> Queue a
put = insert
-- Simplified function to generate a number linearly skewed towards the start of the range
-- | Draw from roughly [min - 0.5, max) with the square of a uniform
-- sample, which concentrates results near the low end.  The extra 0.5
-- below min lets the caller's 'ceiling' still produce the lowest index.
linearSkewRandom :: Double -> Double -> StdGen -> (Double, StdGen)
linearSkewRandom min max rng =
    let (u, rng') = randomR (0.0, 1.0) rng
        -- skewedValue = min + (u ** 2) * (max - min)
        skewedValue = (min - 0.5) + (u ** 2) * (max - min + 0.5)
        -- skewedValue = (min - 0.5) + (1 - sqrt u) * (max - min + 0.5)
        -- skewedValue = min + (1 - sqrt u) * (max - min)
    in (skewedValue, rng')
-- Pick an index in @[0, n - 1]@, biased towards the low end, by drawing a
-- skewed 'Double' over the widened range and rounding up with 'ceiling'.
selectSkewedIndex :: Int -> StdGen -> (Int, StdGen)
selectSkewedIndex n gen0 =
    let upper = fromIntegral (n - 1)
        (raw, gen1) = linearSkewRandom 0 upper gen0
    in (ceiling raw, gen1)
-- | Ad-hoc diagnostic: samples 'selectSkewedIndex' 100000 times over a
-- queue of size 100 and prints a CSV histogram (index,count) so the skew
-- of the distribution can be inspected or plotted.
main :: IO ()
main = do
    stdGen <- getStdGen
    --putStrLn "Hello World"
    -- let i = fst $ selectSkewedIndex (size q) stdGen
    -- let x = fst $ take i q
    -- print (i, priority x)
    -- Thread the generator through all draws with a fold, collecting indices.
    let rs = foldr f ([], stdGen) ([1..100000] :: [ Int ])
    mapM_ pf $ countOccurrences $ fst rs
    where
        -- Print one histogram row as "index,count".
        pf :: (Show a, Show b) => (a, b) -> IO ()
        pf (a, b) = putStrLn $ show a ++ "," ++ show b

        -- Fold step: draw one skewed index, prepend it, pass the generator on.
        f _ (xs, gen) =
            let (x, newgen) = selectSkewedIndex (size q) gen
            in (x:xs, newgen)

        -- Dummy 100-element queue; payloads are never forced ('undefined').
        q :: Queue (Elem Int)
        q = fromList [ Elem i undefined | i <- [1..100] ]
-- | Histogram of a list: each distinct value paired with its multiplicity,
-- in ascending order of value.
countOccurrences :: (Eq a, Ord a) => [a] -> [(a, Int)]
countOccurrences xs = [ (v, length run) | run@(v:_) <- group (sort xs) ]

View File

@ -11,5 +11,5 @@ data Site = Site
{ site_id :: Int
, name :: String
, url :: String
} deriving (Show, Generic, FromJSON)
} deriving (Show, Eq, Generic, FromJSON)

157
src/Sync.hs Normal file
View File

@ -0,0 +1,157 @@
{-# LANGUAGE RecordWildCards #-}
module Sync where
import System.Exit (exitFailure)
import qualified Data.Map as Map
import qualified Data.Set as Set
import Data.Maybe (mapMaybe)
-- import Control.Monad.Trans.Except (ExceptT, runExceptT)
import qualified Common.Server.ConsumerSettings as S
import qualified Common.Server.JSONSettings as JS
import qualified Network.DataClient as Client
import qualified Lib
import qualified Network.GetLatestPostsPerBoardResponse as GLPPBR
import qualified SitesType as Site
import qualified BoardsType as Board
import qualified BoardQueueElem as QE
import qualified PriorityQueue as PQ
-- | Project the consumer settings onto the generic JSON settings record.
-- Deliberately PARTIAL: fields the sync process never reads are left
-- 'undefined' and will crash if anything forces them.
consumerSettingsToPartialJSONSettings :: S.ConsumerJSONSettings -> JS.JSONSettings
consumerSettingsToPartialJSONSettings S.ConsumerJSONSettings {..} =
    JS.JSONSettings
        { JS.postgrest_url = postgrest_url
        , JS.jwt = jwt
        , JS.backup_read_root = undefined
        , JS.media_root_path = media_root_path
        , JS.site_name = undefined
        , JS.site_url = undefined
        }
-- | Entry point for keeping the database in sync with the configured sites.
-- Ensures every configured site and board exists server-side, then builds a
-- priority queue of boards ordered by the creation time of their latest post.
-- Exits the process if the latest-posts query fails.
syncWebsites :: S.ConsumerJSONSettings -> IO ()
syncWebsites consumer_settings = do
    putStrLn "Starting channel web synchronization."
    let json_settings = consumerSettingsToPartialJSONSettings consumer_settings
    sitesResult <- Client.getAllSites json_settings
    -- Create any configured site that the server does not know about yet.
    sites <- mapM (flip Lib.ensureSiteExists sitesResult . Lib.toClientSettings consumer_settings) (S.websites consumer_settings)
    print sites
    latest_posts_per_board_results <- Client.getLatestPostsPerBoard json_settings
    latest_posts_per_board <- case latest_posts_per_board_results of
        Left e -> do
            putStrLn $ "Error getting board information: " ++ show e
            exitFailure
        Right latest_posts_per_board -> return latest_posts_per_board
    print latest_posts_per_board
    -- site_id -> pathparts of boards already present in the database.
    -- NOTE(review): lazy 'foldl' builds thunks; 'foldl'' would be safer here.
    let boards_per_site :: Map.Map Int [ String ] =
            foldl
                (\m b ->
                    let key = GLPPBR.site_id b
                        pathpart = GLPPBR.pathpart b
                    in
                        Map.insertWith (++) key [ pathpart ] m
                )
                Map.empty
                latest_posts_per_board
    -- board_id -> creation time of its newest post (priority for the queue).
    let board_id_to_last_modified = Map.fromList $
            map
                (\b -> (GLPPBR.board_id b, GLPPBR.creation_time b))
                latest_posts_per_board
    let site_name_to_site :: Map.Map String Site.Site =
            Map.fromList $ map (\s -> (Site.name s, s)) sites
    -- (site_id, pathpart) -> latest-post row, for existing-board lookups.
    let site_id_board_id_to_glppbr = Map.fromList $
            map
                (\b -> ((GLPPBR.site_id b, GLPPBR.pathpart b), b))
                latest_posts_per_board
    site_and_board_list_ <- mapM
        (\site_settings -> do
            -- NOTE(review): 'Map.!' is partial — crashes if a configured site
            -- name is absent from the server's site list.
            let site = (Map.!) site_name_to_site (S.name site_settings)
            let s_id = Site.site_id site
            -- Rows for configured boards that already exist in the database.
            let existing_board_info =
                    mapMaybe
                        (\board_pathpart ->
                            Map.lookup (s_id, board_pathpart) site_id_board_id_to_glppbr
                        )
                        (S.boards site_settings)
            let existing_boards =
                    map
                        (\b -> Board.Board
                            { Board.board_id = GLPPBR.board_id b
                            , Board.name = Nothing
                            , Board.pathpart = GLPPBR.pathpart b
                            , Board.site_id = GLPPBR.site_id b
                            }
                        )
                        existing_board_info
            -- Create configured boards that are missing server-side.
            -- NOTE(review): another partial 'Map.!' — fails for a site with
            -- no rows in latest_posts_per_board.
            boards <- Lib.createArchivesForNewBoards
                (Lib.toClientSettings consumer_settings site_settings)
                (Set.fromList $ S.boards site_settings)
                ((Map.!) boards_per_site s_id)
                s_id
            return (site, existing_boards ++ boards)
        )
        (S.websites consumer_settings)
    -- Flatten [(site, [board])] into [(site, board)] pairs.
    let site_and_board_list = concatMap (\(a, bs) -> map (\b -> (a, b)) bs) site_and_board_list_
    -- NOTE(review): 'Map.!' here is partial for boards created just above,
    -- which have no last-modified entry — verify this cannot be reached.
    let queue_elems =
            map
                (\(site, board) -> QE.BoardQueueElem
                    { QE.site = site
                    , QE.board = board
                    , QE.last_modified =
                        (Map.!)
                            board_id_to_last_modified
                            (Board.board_id board)
                    }
                )
                site_and_board_list
    let pq :: PQ.Queue QE.BoardQueueElem = Set.fromList queue_elems
    print pq
    -- we have our boards last modified timestamps
    -- get list of boards per site
    -- first we need all the (Site, Board) tuples ✓
    -- perhaps we even want all (Site, Board, Thread) ✓
    -- But then we don't load the posts of each thread, instead only do
    -- that for threads which change,
    -- - which means after we get all the threads
    -- - enter a loop where you
    --    - pick a board
    --    - compare the threads online to memory
    --    - load only the changed/new ones
    --    - put board back
    -- NEW TODO:
    -- - ensure that sites in the settings exist in the database! ✓
    -- - ensure that boards per site in the settings exist in the database! ✓
    -- - finish using ExceptT and use sites, latest_posts_per_board to populate
    --   our PriorityQueue
    -- - write event loop that
    --    - get pq from stm shared value
    --    - uses the pq (there was something about the timestamps in the pq having to be reversed btw)
    --    - ensures threads
    --    - has a value that should be added to the pq
    --    - uses stm to update pq shared value