Can read in the whole site from the api json files

This commit is contained in:
towards-a-new-leftypol 2023-10-13 14:43:02 -04:00
parent 7bab7ea3f3
commit c18be31289
10 changed files with 204 additions and 34 deletions

View File

@ -68,12 +68,13 @@ executable chan-delorean
other-modules: other-modules:
JSONParsing JSONParsing
DataClient DataClient
Types JSONSettings
SitesType SitesType
BoardsType BoardsType
ThreadType ThreadType
JSONPost JSONPost
JSONCommonTypes JSONCommonTypes
PostsType
-- LANGUAGE extensions used by modules in this package. -- LANGUAGE extensions used by modules in this package.
-- other-extensions: -- other-extensions:

View File

@ -9,7 +9,7 @@ let
}: }:
mkDerivation { mkDerivation {
pname = "chan-delorean"; pname = "chan-delorean";
version = "0.0.0.0"; version = "0.0.2";
src = ./.; src = ./.;
isLibrary = false; isLibrary = false;
isExecutable = true; isExecutable = true;

View File

@ -110,6 +110,69 @@ CREATE INDEX attachments_post_id_idx ON attachments (post_id);
CREATE INDEX attachments_md5_hash_idx ON attachments (md5_hash); CREATE INDEX attachments_md5_hash_idx ON attachments (md5_hash);
CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops); CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops);
/*
* Function Definitions
*/
/*
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
RETURNS TABLE (post_id bigint, board_post_id bigint) AS $$
WITH inserted AS (
INSERT INTO posts (board_post_id, creation_time, body, thread_id)
SELECT np.board_post_id, np.creation_time, np.body, np.thread_id
FROM unnest(new_posts) AS np
ON CONFLICT (thread_id, board_post_id) DO NOTHING
RETURNING post_id, board_post_id
),
selected AS (
SELECT post_id, board_post_id
FROM posts
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
)
SELECT * FROM inserted
UNION ALL
SELECT * FROM selected WHERE (post_id, board_post_id) NOT IN (SELECT post_id, board_post_id FROM inserted);
$$ LANGUAGE sql;
-- 3m37s for clean db
-- 1m34s for full db (nothing inserted)
*/
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
RETURNS TABLE (post_id bigint, board_post_id bigint, thread_id bigint) AS $$
WITH
selected AS (
SELECT post_id, board_post_id, thread_id
FROM posts
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
),
to_insert AS (
SELECT np.*
FROM unnest(new_posts) AS np
LEFT OUTER JOIN selected s ON np.thread_id = s.thread_id AND np.board_post_id = s.board_post_id
WHERE s.post_id IS NULL
),
inserted AS (
INSERT INTO posts (board_post_id, creation_time, body, thread_id)
SELECT board_post_id, creation_time, body, thread_id
FROM to_insert
RETURNING post_id, board_post_id, thread_id
)
SELECT * FROM inserted
UNION ALL
SELECT * FROM selected;
$$ LANGUAGE sql;
-- 1:51 for clean db (this varies a lot)
-- 1:21 for full db (nothing inserted)
/*
* Permissions
*/
CREATE ROLE chan_archive_anon nologin; CREATE ROLE chan_archive_anon nologin;
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon; GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
GRANT SELECT ON sites TO chan_archive_anon; GRANT SELECT ON sites TO chan_archive_anon;
@ -130,9 +193,12 @@ GRANT ALL ON threads TO chan_archiver;
GRANT ALL ON posts TO chan_archiver; GRANT ALL ON posts TO chan_archiver;
GRANT ALL ON attachments TO chan_archiver; GRANT ALL ON attachments TO chan_archiver;
GRANT EXECUTE ON FUNCTION update_post_body_search_index TO chan_archiver; GRANT EXECUTE ON FUNCTION update_post_body_search_index TO chan_archiver;
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver; GRANT EXECUTE ON FUNCTION insert_posts_and_return_ids TO chan_archiver;
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE posts_post_id_seq TO chan_archiver;
GRANT chan_archiver TO admin; GRANT chan_archiver TO admin;
COMMIT; COMMIT;

View File

@ -12,14 +12,14 @@ import qualified Data.Set as Set
import Data.Time.Clock.POSIX (posixSecondsToUTCTime) import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Data.Time.Clock (UTCTime) import Data.Time.Clock (UTCTime)
import JSONParsing import JSONParsing
import Types import JSONSettings
import qualified JSONPost as JSONPosts
import qualified DataClient as Client import qualified DataClient as Client
import qualified SitesType as Sites import qualified SitesType as Sites
import qualified BoardsType as Boards import qualified BoardsType as Boards
import qualified ThreadType as Threads import qualified ThreadType as Threads
import qualified JSONPost as JSONPosts import qualified PostsType as Posts
data SettingsCLI = SettingsCLI data SettingsCLI = SettingsCLI
{ jsonFile :: FilePath { jsonFile :: FilePath
@ -28,7 +28,7 @@ data SettingsCLI = SettingsCLI
settingsCLI :: SettingsCLI settingsCLI :: SettingsCLI
settingsCLI = SettingsCLI settingsCLI = SettingsCLI
{ jsonFile = def &= args &= typ "settings-jsonfile-path" { jsonFile = def &= args &= typ "settings-jsonfile-path"
} &= summary "Backfill v0.0.1" } &= summary "Backfill v0.0.2"
listCatalogDirectories :: JSONSettings -> IO [FilePath] listCatalogDirectories :: JSONSettings -> IO [FilePath]
@ -160,7 +160,12 @@ ensureThreads settings board all_threads = do
new_threads <- createArchivesForNewThreads settings all_threads archived_threads board new_threads <- createArchivesForNewThreads settings all_threads archived_threads board
return $ archived_threads ++ new_threads return $ archived_threads ++ new_threads
readPosts :: JSONSettings -> Boards.Board -> Threads.Thread -> IO [ JSONPosts.Post ]
readPosts
:: JSONSettings
-> Boards.Board
-> Threads.Thread
-> IO (Threads.Thread, [ JSONPosts.Post ])
readPosts settings board thread = do readPosts settings board thread = do
result <- parsePosts thread_filename result <- parsePosts thread_filename
@ -168,7 +173,7 @@ readPosts settings board thread = do
Left err -> do Left err -> do
putStrLn $ "Failed to parse the JSON file " ++ thread_filename ++ " error: " ++ err putStrLn $ "Failed to parse the JSON file " ++ thread_filename ++ " error: " ++ err
exitFailure exitFailure
Right posts_wrapper -> return $ JSONPosts.posts posts_wrapper Right posts_wrapper -> return $ (thread, JSONPosts.posts posts_wrapper)
where where
thread_filename :: FilePath thread_filename :: FilePath
@ -178,6 +183,40 @@ readPosts settings board thread = do
backupDir = backup_read_root settings </> (Boards.pathpart board) backupDir = backup_read_root settings </> (Boards.pathpart board)
ensurePosts
:: JSONSettings
-> Boards.Board
-> [(Threads.Thread, [ Posts.Post ])]
-> IO [(Threads.Thread, [ Posts.Post ])]
ensurePosts = undefined
-- Convert Post to DbPost
apiPostToArchivePost :: Threads.Thread -> JSONPosts.Post -> Posts.Post
apiPostToArchivePost thread post =
Posts.Post
{ Posts.post_id = Nothing
, Posts.board_post_id = JSONPosts.no post
, Posts.creation_time = posixSecondsToUTCTime (realToFrac $ JSONPosts.time post)
, Posts.body = JSONPosts.com post
, Posts.thread_id = Threads.thread_id thread
}
-- | A version of 'concatMap' that works with a monadic predicate.
-- Stolen from package extra Control.Monad.Extra
concatMapM :: Monad m => (a -> m [b]) -> [a] -> m [b]
{-# INLINE concatMapM #-}
concatMapM op = foldr f (pure [])
where f x xs = do
x_ <- op x
if null x_
then xs
else do
xs_ <- xs
pure $ x_ ++ xs_
processBoard :: JSONSettings -> Boards.Board -> IO () processBoard :: JSONSettings -> Boards.Board -> IO ()
processBoard settings board = do processBoard settings board = do
let catalogPath = backupDir </> "catalog.json" let catalogPath = backupDir </> "catalog.json"
@ -190,6 +229,26 @@ processBoard settings board = do
let threads_on_board = concatMap threads catalogs let threads_on_board = concatMap threads catalogs
all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board
all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board
-- putStrLn $ "Number of posts on /" ++ (Boards.pathpart board) ++ "/ " ++ (show $ length all_posts_on_board)
posts_result <- Client.postPosts settings (concatMap (\(t, posts) -> map (apiPostToArchivePost t) posts) all_posts_on_board)
-- TODO: why doesn't it insert posts for threads that already exist? we can have new posts!
case posts_result of
Left err -> print err
Right new_ids -> do
putStrLn "Sum of post_ids:"
print $ sum $ map Client.post_id new_ids
putStrLn "Sum of board_post_ids:"
print $ sum $ map Client.board_post_id new_ids
-- max: 18,645
-- min: 147
-- total: 191,628
--
-- f :: Threads.Thread -> [ Posts.Post ] -- f :: Threads.Thread -> [ Posts.Post ]
-- for each thread we have to call a function that -- for each thread we have to call a function that
-- - reads the thread under the board directory: -- - reads the thread under the board directory:

View File

@ -1,7 +1,9 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveAnyClass #-}
module DataClient module DataClient
( HttpError(..) ( HttpError(..)
, PostId (..)
, get , get
, getSiteBoards , getSiteBoards
, getAllSites , getAllSites
@ -10,8 +12,10 @@ module DataClient
, postBoards , postBoards
, getThreads , getThreads
, postThreads , postThreads
, postPosts
) where ) where
import Data.Int (Int64)
import Network.HTTP.Simple import Network.HTTP.Simple
import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString.Lazy.Char8 as LC8 import qualified Data.ByteString.Lazy.Char8 as LC8
@ -27,17 +31,24 @@ import Data.Aeson
, encode , encode
, Value , Value
) )
import GHC.Generics
import qualified Types as T import qualified JSONSettings as T
import qualified SitesType as Sites import qualified SitesType as Sites
import qualified BoardsType as Boards import qualified BoardsType as Boards
import qualified ThreadType as Threads import qualified ThreadType as Threads
import qualified PostsType as Posts
data HttpError data HttpError
= HttpException SomeException = HttpException SomeException
| StatusCodeError Int LBS.ByteString | StatusCodeError Int LBS.ByteString
deriving (Show) deriving (Show)
data PostId = PostId
{ post_id :: Int64
, board_post_id :: Int64
, thread_id :: Int64
} deriving (Show, Generic, FromJSON)
get :: T.JSONSettings -> String -> IO (Either HttpError LBS.ByteString) get :: T.JSONSettings -> String -> IO (Either HttpError LBS.ByteString)
get settings path = do get settings path = do
@ -151,6 +162,17 @@ getThreads settings board_id board_thread_ids =
path = "/threads?board_thread_id=in.(" ++ ids ++ ")&board_id=eq." ++ show board_id path = "/threads?board_thread_id=in.(" ++ ids ++ ")&board_id=eq." ++ show board_id
ids :: String = intercalate "," $ map show board_thread_ids ids :: String = intercalate "," $ map show board_thread_ids
postPosts
:: T.JSONSettings
-> [ Posts.Post ]
-> IO (Either HttpError [ PostId ])
postPosts settings posts =
post settings "/rpc/insert_posts_and_return_ids" payload True >>= return . eitherDecodeResponse
where
payload = encode $ object [ "new_posts" .= posts ]
eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a
eitherDecodeResponse (Left err) = Left err eitherDecodeResponse (Left err) = Left err
eitherDecodeResponse (Right bs) = eitherDecodeResponse (Right bs) =

View File

@ -1,14 +1,29 @@
module JSONCommonTypes module JSONCommonTypes
( File (..) ( File (..)
, Cyclical (..)
) where ) where
import Data.Text (Text) import Data.Text (Text)
import Data.Aeson (FromJSON) import qualified Data.Text as T
import Data.Aeson
import Data.Aeson.Types (typeMismatch)
import GHC.Generics import GHC.Generics
data Cyclical = Cyclical Int deriving (Show, Generic)
instance FromJSON Cyclical where
parseJSON (Number n) = return $ Cyclical (floor n)
parseJSON (String s) =
case reads (T.unpack s) :: [(Int, String)] of
[(n, "")] -> return $ Cyclical n
_ -> typeMismatch "Int or String containing Int" (String s)
parseJSON invalid = typeMismatch "Int or String" invalid
data File = File data File = File
{ id :: Text { id :: Text
, mime :: Text , mime :: Maybe Text
, ext :: Text , ext :: Text
, h :: Maybe Int , h :: Maybe Int
, w :: Maybe Int , w :: Maybe Int

View File

@ -9,26 +9,13 @@ module JSONParsing
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
import Data.Text (Text) import Data.Text (Text)
import Data.Aeson
import GHC.Generics import GHC.Generics
import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy as B
import qualified Data.Text as T import Data.Aeson
import Data.Aeson.Types (typeMismatch)
import qualified JSONPost as Post import qualified JSONPost as Post
import qualified JSONCommonTypes as J import qualified JSONCommonTypes as J
data Cyclical = Cyclical Int deriving (Show, Generic)
instance FromJSON Cyclical where
parseJSON (Number n) = return $ Cyclical (floor n)
parseJSON (String s) =
case reads (T.unpack s) :: [(Int, String)] of
[(n, "")] -> return $ Cyclical n
_ -> typeMismatch "Int or String containing Int" (String s)
parseJSON invalid = typeMismatch "Int or String" invalid
data Thread = Thread data Thread = Thread
{ no :: Int { no :: Int
, sub :: Maybe Text , sub :: Maybe Text
@ -42,7 +29,7 @@ data Thread = Thread
, images :: Maybe Int , images :: Maybe Int
, sticky :: Maybe Int , sticky :: Maybe Int
, locked :: Maybe Int , locked :: Maybe Int
, cyclical :: Maybe Cyclical , cyclical :: Maybe J.Cyclical
, last_modified :: Int , last_modified :: Int
, board :: Text , board :: Text
, files :: Maybe [J.File] , files :: Maybe [J.File]

View File

@ -4,12 +4,13 @@ module JSONPost
) where ) where
import Data.Text (Text) import Data.Text (Text)
import Data.Int (Int64)
import Data.Aeson (FromJSON) import Data.Aeson (FromJSON)
import GHC.Generics import GHC.Generics
import qualified JSONCommonTypes as J import qualified JSONCommonTypes as J
data Post = Post data Post = Post
{ no :: Int { no :: Int64
, com :: Maybe Text , com :: Maybe Text
, name :: Maybe Text , name :: Maybe Text
, time :: Int , time :: Int
@ -17,7 +18,7 @@ data Post = Post
, omitted_images :: Maybe Int , omitted_images :: Maybe Int
, sticky :: Maybe Int , sticky :: Maybe Int
, locked :: Maybe Int , locked :: Maybe Int
, cyclical :: Maybe Int , cyclical :: Maybe J.Cyclical
, last_modified :: Int , last_modified :: Int
, board :: String , board :: String
, files :: Maybe [J.File] , files :: Maybe [J.File]

View File

@ -1,4 +1,4 @@
module Types module JSONSettings
( JSONSettings(..) ( JSONSettings(..)
) where ) where

19
src/PostsType.hs Normal file
View File

@ -0,0 +1,19 @@
{-# LANGUAGE DeriveAnyClass #-}
module PostsType
( Post (..) )
where
import GHC.Generics
import Data.Aeson (FromJSON, ToJSON)
import Data.Time.Clock (UTCTime) -- Required for timestamp with time zone
import Data.Int (Int64)
import Data.Text (Text)
data Post = Post
{ post_id :: Maybe Int64
, board_post_id :: Int64
, creation_time :: UTCTime
, body :: Maybe Text
, thread_id :: Int
} deriving (Show, Generic, FromJSON, ToJSON)