Can read in the whole site from the api json files
This commit is contained in:
parent
7bab7ea3f3
commit
c18be31289
|
@ -68,12 +68,13 @@ executable chan-delorean
|
|||
other-modules:
|
||||
JSONParsing
|
||||
DataClient
|
||||
Types
|
||||
JSONSettings
|
||||
SitesType
|
||||
BoardsType
|
||||
ThreadType
|
||||
JSONPost
|
||||
JSONCommonTypes
|
||||
PostsType
|
||||
|
||||
-- LANGUAGE extensions used by modules in this package.
|
||||
-- other-extensions:
|
||||
|
|
|
@ -9,7 +9,7 @@ let
|
|||
}:
|
||||
mkDerivation {
|
||||
pname = "chan-delorean";
|
||||
version = "0.0.0.0";
|
||||
version = "0.0.2";
|
||||
src = ./.;
|
||||
isLibrary = false;
|
||||
isExecutable = true;
|
||||
|
|
|
@ -110,6 +110,69 @@ CREATE INDEX attachments_post_id_idx ON attachments (post_id);
|
|||
CREATE INDEX attachments_md5_hash_idx ON attachments (md5_hash);
|
||||
CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops);
|
||||
|
||||
|
||||
/*
|
||||
* Function Definitions
|
||||
*/
|
||||
|
||||
/*
|
||||
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
|
||||
RETURNS TABLE (post_id bigint, board_post_id bigint) AS $$
|
||||
WITH inserted AS (
|
||||
INSERT INTO posts (board_post_id, creation_time, body, thread_id)
|
||||
SELECT np.board_post_id, np.creation_time, np.body, np.thread_id
|
||||
FROM unnest(new_posts) AS np
|
||||
ON CONFLICT (thread_id, board_post_id) DO NOTHING
|
||||
RETURNING post_id, board_post_id
|
||||
),
|
||||
selected AS (
|
||||
SELECT post_id, board_post_id
|
||||
FROM posts
|
||||
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
|
||||
)
|
||||
SELECT * FROM inserted
|
||||
UNION ALL
|
||||
SELECT * FROM selected WHERE (post_id, board_post_id) NOT IN (SELECT post_id, board_post_id FROM inserted);
|
||||
$$ LANGUAGE sql;
|
||||
|
||||
-- 3m37s for clean db
|
||||
-- 1m34s for full db (nothing inserted)
|
||||
|
||||
*/
|
||||
|
||||
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
|
||||
RETURNS TABLE (post_id bigint, board_post_id bigint, thread_id bigint) AS $$
|
||||
WITH
|
||||
selected AS (
|
||||
SELECT post_id, board_post_id, thread_id
|
||||
FROM posts
|
||||
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
|
||||
),
|
||||
to_insert AS (
|
||||
SELECT np.*
|
||||
FROM unnest(new_posts) AS np
|
||||
LEFT OUTER JOIN selected s ON np.thread_id = s.thread_id AND np.board_post_id = s.board_post_id
|
||||
WHERE s.post_id IS NULL
|
||||
),
|
||||
inserted AS (
|
||||
INSERT INTO posts (board_post_id, creation_time, body, thread_id)
|
||||
SELECT board_post_id, creation_time, body, thread_id
|
||||
FROM to_insert
|
||||
RETURNING post_id, board_post_id, thread_id
|
||||
)
|
||||
SELECT * FROM inserted
|
||||
UNION ALL
|
||||
SELECT * FROM selected;
|
||||
$$ LANGUAGE sql;
|
||||
|
||||
-- 1:51 for clean db (this varies a lot)
|
||||
-- 1:21 for full db (nothing inserted)
|
||||
|
||||
|
||||
/*
|
||||
* Permissions
|
||||
*/
|
||||
|
||||
CREATE ROLE chan_archive_anon nologin;
|
||||
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
|
||||
GRANT SELECT ON sites TO chan_archive_anon;
|
||||
|
@ -130,9 +193,12 @@ GRANT ALL ON threads TO chan_archiver;
|
|||
GRANT ALL ON posts TO chan_archiver;
|
||||
GRANT ALL ON attachments TO chan_archiver;
|
||||
GRANT EXECUTE ON FUNCTION update_post_body_search_index TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
|
||||
GRANT EXECUTE ON FUNCTION insert_posts_and_return_ids TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE posts_post_id_seq TO chan_archiver;
|
||||
|
||||
GRANT chan_archiver TO admin;
|
||||
|
||||
COMMIT;
|
||||
|
|
|
@ -12,14 +12,14 @@ import qualified Data.Set as Set
|
|||
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
|
||||
import Data.Time.Clock (UTCTime)
|
||||
|
||||
|
||||
import JSONParsing
|
||||
import Types
|
||||
import JSONSettings
|
||||
import qualified JSONPost as JSONPosts
|
||||
import qualified DataClient as Client
|
||||
import qualified SitesType as Sites
|
||||
import qualified SitesType as Sites
|
||||
import qualified BoardsType as Boards
|
||||
import qualified ThreadType as Threads
|
||||
import qualified JSONPost as JSONPosts
|
||||
import qualified PostsType as Posts
|
||||
|
||||
data SettingsCLI = SettingsCLI
|
||||
{ jsonFile :: FilePath
|
||||
|
@ -28,7 +28,7 @@ data SettingsCLI = SettingsCLI
|
|||
settingsCLI :: SettingsCLI
|
||||
settingsCLI = SettingsCLI
|
||||
{ jsonFile = def &= args &= typ "settings-jsonfile-path"
|
||||
} &= summary "Backfill v0.0.1"
|
||||
} &= summary "Backfill v0.0.2"
|
||||
|
||||
|
||||
listCatalogDirectories :: JSONSettings -> IO [FilePath]
|
||||
|
@ -160,7 +160,12 @@ ensureThreads settings board all_threads = do
|
|||
new_threads <- createArchivesForNewThreads settings all_threads archived_threads board
|
||||
return $ archived_threads ++ new_threads
|
||||
|
||||
readPosts :: JSONSettings -> Boards.Board -> Threads.Thread -> IO [ JSONPosts.Post ]
|
||||
|
||||
readPosts
|
||||
:: JSONSettings
|
||||
-> Boards.Board
|
||||
-> Threads.Thread
|
||||
-> IO (Threads.Thread, [ JSONPosts.Post ])
|
||||
readPosts settings board thread = do
|
||||
result <- parsePosts thread_filename
|
||||
|
||||
|
@ -168,16 +173,50 @@ readPosts settings board thread = do
|
|||
Left err -> do
|
||||
putStrLn $ "Failed to parse the JSON file " ++ thread_filename ++ " error: " ++ err
|
||||
exitFailure
|
||||
Right posts_wrapper -> return $ JSONPosts.posts posts_wrapper
|
||||
Right posts_wrapper -> return $ (thread, JSONPosts.posts posts_wrapper)
|
||||
|
||||
where
|
||||
thread_filename :: FilePath
|
||||
thread_filename :: FilePath
|
||||
thread_filename = backupDir </> "res" </> ((show $ Threads.board_thread_id thread) ++ ".json")
|
||||
|
||||
backupDir :: FilePath
|
||||
backupDir = backup_read_root settings </> (Boards.pathpart board)
|
||||
|
||||
|
||||
ensurePosts
|
||||
:: JSONSettings
|
||||
-> Boards.Board
|
||||
-> [(Threads.Thread, [ Posts.Post ])]
|
||||
-> IO [(Threads.Thread, [ Posts.Post ])]
|
||||
ensurePosts = undefined
|
||||
|
||||
|
||||
-- Convert Post to DbPost
|
||||
apiPostToArchivePost :: Threads.Thread -> JSONPosts.Post -> Posts.Post
|
||||
apiPostToArchivePost thread post =
|
||||
Posts.Post
|
||||
{ Posts.post_id = Nothing
|
||||
, Posts.board_post_id = JSONPosts.no post
|
||||
, Posts.creation_time = posixSecondsToUTCTime (realToFrac $ JSONPosts.time post)
|
||||
, Posts.body = JSONPosts.com post
|
||||
, Posts.thread_id = Threads.thread_id thread
|
||||
}
|
||||
|
||||
-- | A version of 'concatMap' that works with a monadic predicate.
|
||||
-- Stolen from package extra Control.Monad.Extra
|
||||
concatMapM :: Monad m => (a -> m [b]) -> [a] -> m [b]
|
||||
{-# INLINE concatMapM #-}
|
||||
concatMapM op = foldr f (pure [])
|
||||
where f x xs = do
|
||||
x_ <- op x
|
||||
|
||||
if null x_
|
||||
then xs
|
||||
else do
|
||||
xs_ <- xs
|
||||
pure $ x_ ++ xs_
|
||||
|
||||
|
||||
processBoard :: JSONSettings -> Boards.Board -> IO ()
|
||||
processBoard settings board = do
|
||||
let catalogPath = backupDir </> "catalog.json"
|
||||
|
@ -190,6 +229,26 @@ processBoard settings board = do
|
|||
let threads_on_board = concatMap threads catalogs
|
||||
|
||||
all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board
|
||||
|
||||
all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board
|
||||
|
||||
-- putStrLn $ "Number of posts on /" ++ (Boards.pathpart board) ++ "/ " ++ (show $ length all_posts_on_board)
|
||||
posts_result <- Client.postPosts settings (concatMap (\(t, posts) -> map (apiPostToArchivePost t) posts) all_posts_on_board)
|
||||
|
||||
-- TODO: why doesn't it insert posts for threads that already exist? we can have new posts!
|
||||
|
||||
case posts_result of
|
||||
Left err -> print err
|
||||
Right new_ids -> do
|
||||
putStrLn "Sum of post_ids:"
|
||||
print $ sum $ map Client.post_id new_ids
|
||||
putStrLn "Sum of board_post_ids:"
|
||||
print $ sum $ map Client.board_post_id new_ids
|
||||
|
||||
-- max: 18,645
|
||||
-- min: 147
|
||||
-- total: 191,628
|
||||
--
|
||||
-- f :: Threads.Thread -> [ Posts.Post ]
|
||||
-- for each thread we have to call a function that
|
||||
-- - reads the thread under the board directory:
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE DeriveAnyClass #-}
|
||||
|
||||
module DataClient
|
||||
( HttpError(..)
|
||||
, PostId (..)
|
||||
, get
|
||||
, getSiteBoards
|
||||
, getAllSites
|
||||
|
@ -10,8 +12,10 @@ module DataClient
|
|||
, postBoards
|
||||
, getThreads
|
||||
, postThreads
|
||||
, postPosts
|
||||
) where
|
||||
|
||||
import Data.Int (Int64)
|
||||
import Network.HTTP.Simple
|
||||
import qualified Data.ByteString.Lazy as LBS
|
||||
import qualified Data.ByteString.Lazy.Char8 as LC8
|
||||
|
@ -27,17 +31,24 @@ import Data.Aeson
|
|||
, encode
|
||||
, Value
|
||||
)
|
||||
import GHC.Generics
|
||||
|
||||
import qualified Types as T
|
||||
import qualified JSONSettings as T
|
||||
import qualified SitesType as Sites
|
||||
import qualified BoardsType as Boards
|
||||
import qualified ThreadType as Threads
|
||||
import qualified PostsType as Posts
|
||||
|
||||
data HttpError
|
||||
= HttpException SomeException
|
||||
| StatusCodeError Int LBS.ByteString
|
||||
deriving (Show)
|
||||
|
||||
data PostId = PostId
|
||||
{ post_id :: Int64
|
||||
, board_post_id :: Int64
|
||||
, thread_id :: Int64
|
||||
} deriving (Show, Generic, FromJSON)
|
||||
|
||||
get :: T.JSONSettings -> String -> IO (Either HttpError LBS.ByteString)
|
||||
get settings path = do
|
||||
|
@ -151,6 +162,17 @@ getThreads settings board_id board_thread_ids =
|
|||
path = "/threads?board_thread_id=in.(" ++ ids ++ ")&board_id=eq." ++ show board_id
|
||||
ids :: String = intercalate "," $ map show board_thread_ids
|
||||
|
||||
postPosts
|
||||
:: T.JSONSettings
|
||||
-> [ Posts.Post ]
|
||||
-> IO (Either HttpError [ PostId ])
|
||||
postPosts settings posts =
|
||||
post settings "/rpc/insert_posts_and_return_ids" payload True >>= return . eitherDecodeResponse
|
||||
|
||||
where
|
||||
payload = encode $ object [ "new_posts" .= posts ]
|
||||
|
||||
|
||||
eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a
|
||||
eitherDecodeResponse (Left err) = Left err
|
||||
eitherDecodeResponse (Right bs) =
|
||||
|
|
|
@ -1,14 +1,29 @@
|
|||
module JSONCommonTypes
|
||||
( File (..)
|
||||
, Cyclical (..)
|
||||
) where
|
||||
|
||||
import Data.Text (Text)
|
||||
import Data.Aeson (FromJSON)
|
||||
import qualified Data.Text as T
|
||||
import Data.Aeson
|
||||
import Data.Aeson.Types (typeMismatch)
|
||||
import GHC.Generics
|
||||
|
||||
data Cyclical = Cyclical Int deriving (Show, Generic)
|
||||
|
||||
instance FromJSON Cyclical where
|
||||
parseJSON (Number n) = return $ Cyclical (floor n)
|
||||
parseJSON (String s) =
|
||||
case reads (T.unpack s) :: [(Int, String)] of
|
||||
[(n, "")] -> return $ Cyclical n
|
||||
_ -> typeMismatch "Int or String containing Int" (String s)
|
||||
|
||||
parseJSON invalid = typeMismatch "Int or String" invalid
|
||||
|
||||
|
||||
data File = File
|
||||
{ id :: Text
|
||||
, mime :: Text
|
||||
, mime :: Maybe Text
|
||||
, ext :: Text
|
||||
, h :: Maybe Int
|
||||
, w :: Maybe Int
|
||||
|
|
|
@ -9,26 +9,13 @@ module JSONParsing
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
import Data.Text (Text)
|
||||
import Data.Aeson
|
||||
import GHC.Generics
|
||||
import qualified Data.ByteString.Lazy as B
|
||||
import qualified Data.Text as T
|
||||
import Data.Aeson.Types (typeMismatch)
|
||||
import Data.Aeson
|
||||
|
||||
import qualified JSONPost as Post
|
||||
import qualified JSONCommonTypes as J
|
||||
|
||||
data Cyclical = Cyclical Int deriving (Show, Generic)
|
||||
|
||||
instance FromJSON Cyclical where
|
||||
parseJSON (Number n) = return $ Cyclical (floor n)
|
||||
parseJSON (String s) =
|
||||
case reads (T.unpack s) :: [(Int, String)] of
|
||||
[(n, "")] -> return $ Cyclical n
|
||||
_ -> typeMismatch "Int or String containing Int" (String s)
|
||||
|
||||
parseJSON invalid = typeMismatch "Int or String" invalid
|
||||
|
||||
data Thread = Thread
|
||||
{ no :: Int
|
||||
, sub :: Maybe Text
|
||||
|
@ -42,7 +29,7 @@ data Thread = Thread
|
|||
, images :: Maybe Int
|
||||
, sticky :: Maybe Int
|
||||
, locked :: Maybe Int
|
||||
, cyclical :: Maybe Cyclical
|
||||
, cyclical :: Maybe J.Cyclical
|
||||
, last_modified :: Int
|
||||
, board :: Text
|
||||
, files :: Maybe [J.File]
|
||||
|
|
|
@ -4,12 +4,13 @@ module JSONPost
|
|||
) where
|
||||
|
||||
import Data.Text (Text)
|
||||
import Data.Int (Int64)
|
||||
import Data.Aeson (FromJSON)
|
||||
import GHC.Generics
|
||||
import qualified JSONCommonTypes as J
|
||||
|
||||
data Post = Post
|
||||
{ no :: Int
|
||||
{ no :: Int64
|
||||
, com :: Maybe Text
|
||||
, name :: Maybe Text
|
||||
, time :: Int
|
||||
|
@ -17,7 +18,7 @@ data Post = Post
|
|||
, omitted_images :: Maybe Int
|
||||
, sticky :: Maybe Int
|
||||
, locked :: Maybe Int
|
||||
, cyclical :: Maybe Int
|
||||
, cyclical :: Maybe J.Cyclical
|
||||
, last_modified :: Int
|
||||
, board :: String
|
||||
, files :: Maybe [J.File]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
module Types
|
||||
module JSONSettings
|
||||
( JSONSettings(..)
|
||||
) where
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
{-# LANGUAGE DeriveAnyClass #-}
|
||||
|
||||
module PostsType
|
||||
( Post (..) )
|
||||
where
|
||||
|
||||
import GHC.Generics
|
||||
import Data.Aeson (FromJSON, ToJSON)
|
||||
import Data.Time.Clock (UTCTime) -- Required for timestamp with time zone
|
||||
import Data.Int (Int64)
|
||||
import Data.Text (Text)
|
||||
|
||||
data Post = Post
|
||||
{ post_id :: Maybe Int64
|
||||
, board_post_id :: Int64
|
||||
, creation_time :: UTCTime
|
||||
, body :: Maybe Text
|
||||
, thread_id :: Int
|
||||
} deriving (Show, Generic, FromJSON, ToJSON)
|
Loading…
Reference in New Issue