Can read in the whole site from the api json files

This commit is contained in:
towards-a-new-leftypol 2023-10-13 14:43:02 -04:00
parent 7bab7ea3f3
commit c18be31289
10 changed files with 204 additions and 34 deletions

View File

@ -68,12 +68,13 @@ executable chan-delorean
other-modules:
JSONParsing
DataClient
Types
JSONSettings
SitesType
BoardsType
ThreadType
JSONPost
JSONCommonTypes
PostsType
-- LANGUAGE extensions used by modules in this package.
-- other-extensions:

View File

@ -9,7 +9,7 @@ let
}:
mkDerivation {
pname = "chan-delorean";
version = "0.0.0.0";
version = "0.0.2";
src = ./.;
isLibrary = false;
isExecutable = true;

View File

@ -110,6 +110,69 @@ CREATE INDEX attachments_post_id_idx ON attachments (post_id);
CREATE INDEX attachments_md5_hash_idx ON attachments (md5_hash);
CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops);
/*
* Function Definitions
*/
/*
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
RETURNS TABLE (post_id bigint, board_post_id bigint) AS $$
WITH inserted AS (
INSERT INTO posts (board_post_id, creation_time, body, thread_id)
SELECT np.board_post_id, np.creation_time, np.body, np.thread_id
FROM unnest(new_posts) AS np
ON CONFLICT (thread_id, board_post_id) DO NOTHING
RETURNING post_id, board_post_id
),
selected AS (
SELECT post_id, board_post_id
FROM posts
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
)
SELECT * FROM inserted
UNION ALL
SELECT * FROM selected WHERE (post_id, board_post_id) NOT IN (SELECT post_id, board_post_id FROM inserted);
$$ LANGUAGE sql;
-- 3m37s for clean db
-- 1m34s for full db (nothing inserted)
*/
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
RETURNS TABLE (post_id bigint, board_post_id bigint, thread_id bigint) AS $$
WITH
selected AS (
SELECT post_id, board_post_id, thread_id
FROM posts
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
),
to_insert AS (
SELECT np.*
FROM unnest(new_posts) AS np
LEFT OUTER JOIN selected s ON np.thread_id = s.thread_id AND np.board_post_id = s.board_post_id
WHERE s.post_id IS NULL
),
inserted AS (
INSERT INTO posts (board_post_id, creation_time, body, thread_id)
SELECT board_post_id, creation_time, body, thread_id
FROM to_insert
RETURNING post_id, board_post_id, thread_id
)
SELECT * FROM inserted
UNION ALL
SELECT * FROM selected;
$$ LANGUAGE sql;
-- 1:51 for clean db (this varies a lot)
-- 1:21 for full db (nothing inserted)
/*
* Permissions
*/
CREATE ROLE chan_archive_anon nologin;
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
GRANT SELECT ON sites TO chan_archive_anon;
@ -130,9 +193,12 @@ GRANT ALL ON threads TO chan_archiver;
GRANT ALL ON posts TO chan_archiver;
GRANT ALL ON attachments TO chan_archiver;
GRANT EXECUTE ON FUNCTION update_post_body_search_index TO chan_archiver;
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
GRANT EXECUTE ON FUNCTION insert_posts_and_return_ids TO chan_archiver;
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE posts_post_id_seq TO chan_archiver;
GRANT chan_archiver TO admin;
COMMIT;

View File

@ -12,14 +12,14 @@ import qualified Data.Set as Set
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Data.Time.Clock (UTCTime)
import JSONParsing
import Types
import JSONSettings
import qualified JSONPost as JSONPosts
import qualified DataClient as Client
import qualified SitesType as Sites
import qualified SitesType as Sites
import qualified BoardsType as Boards
import qualified ThreadType as Threads
import qualified JSONPost as JSONPosts
import qualified PostsType as Posts
data SettingsCLI = SettingsCLI
{ jsonFile :: FilePath
@ -28,7 +28,7 @@ data SettingsCLI = SettingsCLI
settingsCLI :: SettingsCLI
settingsCLI = SettingsCLI
{ jsonFile = def &= args &= typ "settings-jsonfile-path"
} &= summary "Backfill v0.0.1"
} &= summary "Backfill v0.0.2"
listCatalogDirectories :: JSONSettings -> IO [FilePath]
@ -160,7 +160,12 @@ ensureThreads settings board all_threads = do
new_threads <- createArchivesForNewThreads settings all_threads archived_threads board
return $ archived_threads ++ new_threads
readPosts :: JSONSettings -> Boards.Board -> Threads.Thread -> IO [ JSONPosts.Post ]
readPosts
:: JSONSettings
-> Boards.Board
-> Threads.Thread
-> IO (Threads.Thread, [ JSONPosts.Post ])
readPosts settings board thread = do
result <- parsePosts thread_filename
@ -168,16 +173,50 @@ readPosts settings board thread = do
Left err -> do
putStrLn $ "Failed to parse the JSON file " ++ thread_filename ++ " error: " ++ err
exitFailure
Right posts_wrapper -> return $ JSONPosts.posts posts_wrapper
Right posts_wrapper -> return $ (thread, JSONPosts.posts posts_wrapper)
where
thread_filename :: FilePath
thread_filename :: FilePath
thread_filename = backupDir </> "res" </> ((show $ Threads.board_thread_id thread) ++ ".json")
backupDir :: FilePath
backupDir = backup_read_root settings </> (Boards.pathpart board)
ensurePosts
:: JSONSettings
-> Boards.Board
-> [(Threads.Thread, [ Posts.Post ])]
-> IO [(Threads.Thread, [ Posts.Post ])]
ensurePosts = undefined
-- Convert Post to DbPost
apiPostToArchivePost :: Threads.Thread -> JSONPosts.Post -> Posts.Post
apiPostToArchivePost thread post =
Posts.Post
{ Posts.post_id = Nothing
, Posts.board_post_id = JSONPosts.no post
, Posts.creation_time = posixSecondsToUTCTime (realToFrac $ JSONPosts.time post)
, Posts.body = JSONPosts.com post
, Posts.thread_id = Threads.thread_id thread
}
-- | A version of 'concatMap' that works with a monadic predicate.
-- Stolen from package extra Control.Monad.Extra
concatMapM :: Monad m => (a -> m [b]) -> [a] -> m [b]
{-# INLINE concatMapM #-}
concatMapM op = foldr f (pure [])
where f x xs = do
x_ <- op x
if null x_
then xs
else do
xs_ <- xs
pure $ x_ ++ xs_
processBoard :: JSONSettings -> Boards.Board -> IO ()
processBoard settings board = do
let catalogPath = backupDir </> "catalog.json"
@ -190,6 +229,26 @@ processBoard settings board = do
let threads_on_board = concatMap threads catalogs
all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board
all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board
-- putStrLn $ "Number of posts on /" ++ (Boards.pathpart board) ++ "/ " ++ (show $ length all_posts_on_board)
posts_result <- Client.postPosts settings (concatMap (\(t, posts) -> map (apiPostToArchivePost t) posts) all_posts_on_board)
-- TODO: why doesn't it insert posts for threads that already exist? we can have new posts!
case posts_result of
Left err -> print err
Right new_ids -> do
putStrLn "Sum of post_ids:"
print $ sum $ map Client.post_id new_ids
putStrLn "Sum of board_post_ids:"
print $ sum $ map Client.board_post_id new_ids
-- max: 18,645
-- min: 147
-- total: 191,628
--
-- f :: Threads.Thread -> [ Posts.Post ]
-- for each thread we have to call a function that
-- - reads the thread under the board directory:

View File

@ -1,7 +1,9 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveAnyClass #-}
module DataClient
( HttpError(..)
, PostId (..)
, get
, getSiteBoards
, getAllSites
@ -10,8 +12,10 @@ module DataClient
, postBoards
, getThreads
, postThreads
, postPosts
) where
import Data.Int (Int64)
import Network.HTTP.Simple
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString.Lazy.Char8 as LC8
@ -27,17 +31,24 @@ import Data.Aeson
, encode
, Value
)
import GHC.Generics
import qualified Types as T
import qualified JSONSettings as T
import qualified SitesType as Sites
import qualified BoardsType as Boards
import qualified ThreadType as Threads
import qualified PostsType as Posts
data HttpError
= HttpException SomeException
| StatusCodeError Int LBS.ByteString
deriving (Show)
data PostId = PostId
{ post_id :: Int64
, board_post_id :: Int64
, thread_id :: Int64
} deriving (Show, Generic, FromJSON)
get :: T.JSONSettings -> String -> IO (Either HttpError LBS.ByteString)
get settings path = do
@ -151,6 +162,17 @@ getThreads settings board_id board_thread_ids =
path = "/threads?board_thread_id=in.(" ++ ids ++ ")&board_id=eq." ++ show board_id
ids :: String = intercalate "," $ map show board_thread_ids
postPosts
:: T.JSONSettings
-> [ Posts.Post ]
-> IO (Either HttpError [ PostId ])
postPosts settings posts =
post settings "/rpc/insert_posts_and_return_ids" payload True >>= return . eitherDecodeResponse
where
payload = encode $ object [ "new_posts" .= posts ]
eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a
eitherDecodeResponse (Left err) = Left err
eitherDecodeResponse (Right bs) =

View File

@ -1,14 +1,29 @@
module JSONCommonTypes
( File (..)
, Cyclical (..)
) where
import Data.Text (Text)
import Data.Aeson (FromJSON)
import qualified Data.Text as T
import Data.Aeson
import Data.Aeson.Types (typeMismatch)
import GHC.Generics
data Cyclical = Cyclical Int deriving (Show, Generic)
instance FromJSON Cyclical where
parseJSON (Number n) = return $ Cyclical (floor n)
parseJSON (String s) =
case reads (T.unpack s) :: [(Int, String)] of
[(n, "")] -> return $ Cyclical n
_ -> typeMismatch "Int or String containing Int" (String s)
parseJSON invalid = typeMismatch "Int or String" invalid
data File = File
{ id :: Text
, mime :: Text
, mime :: Maybe Text
, ext :: Text
, h :: Maybe Int
, w :: Maybe Int

View File

@ -9,26 +9,13 @@ module JSONParsing
{-# LANGUAGE OverloadedStrings #-}
import Data.Text (Text)
import Data.Aeson
import GHC.Generics
import qualified Data.ByteString.Lazy as B
import qualified Data.Text as T
import Data.Aeson.Types (typeMismatch)
import Data.Aeson
import qualified JSONPost as Post
import qualified JSONCommonTypes as J
data Cyclical = Cyclical Int deriving (Show, Generic)
instance FromJSON Cyclical where
parseJSON (Number n) = return $ Cyclical (floor n)
parseJSON (String s) =
case reads (T.unpack s) :: [(Int, String)] of
[(n, "")] -> return $ Cyclical n
_ -> typeMismatch "Int or String containing Int" (String s)
parseJSON invalid = typeMismatch "Int or String" invalid
data Thread = Thread
{ no :: Int
, sub :: Maybe Text
@ -42,7 +29,7 @@ data Thread = Thread
, images :: Maybe Int
, sticky :: Maybe Int
, locked :: Maybe Int
, cyclical :: Maybe Cyclical
, cyclical :: Maybe J.Cyclical
, last_modified :: Int
, board :: Text
, files :: Maybe [J.File]

View File

@ -4,12 +4,13 @@ module JSONPost
) where
import Data.Text (Text)
import Data.Int (Int64)
import Data.Aeson (FromJSON)
import GHC.Generics
import qualified JSONCommonTypes as J
data Post = Post
{ no :: Int
{ no :: Int64
, com :: Maybe Text
, name :: Maybe Text
, time :: Int
@ -17,7 +18,7 @@ data Post = Post
, omitted_images :: Maybe Int
, sticky :: Maybe Int
, locked :: Maybe Int
, cyclical :: Maybe Int
, cyclical :: Maybe J.Cyclical
, last_modified :: Int
, board :: String
, files :: Maybe [J.File]

View File

@ -1,4 +1,4 @@
module Types
module JSONSettings
( JSONSettings(..)
) where

19
src/PostsType.hs Normal file
View File

@ -0,0 +1,19 @@
{-# LANGUAGE DeriveAnyClass #-}
module PostsType
( Post (..) )
where
import GHC.Generics
import Data.Aeson (FromJSON, ToJSON)
import Data.Time.Clock (UTCTime) -- Required for timestamp with time zone
import Data.Int (Int64)
import Data.Text (Text)
data Post = Post
{ post_id :: Maybe Int64
, board_post_id :: Int64
, creation_time :: UTCTime
, body :: Maybe Text
, thread_id :: Int
} deriving (Show, Generic, FromJSON, ToJSON)