Compare commits

...

2 Commits

10 changed files with 312 additions and 67 deletions

View File

@ -68,10 +68,13 @@ executable chan-delorean
other-modules: other-modules:
JSONParsing JSONParsing
DataClient DataClient
Types JSONSettings
SitesType SitesType
BoardsType BoardsType
ThreadType ThreadType
JSONPost
JSONCommonTypes
PostsType
-- LANGUAGE extensions used by modules in this package. -- LANGUAGE extensions used by modules in this package.
-- other-extensions: -- other-extensions:

View File

@ -9,7 +9,7 @@ let
}: }:
mkDerivation { mkDerivation {
pname = "chan-delorean"; pname = "chan-delorean";
version = "0.0.0.0"; version = "0.0.2";
src = ./.; src = ./.;
isLibrary = false; isLibrary = false;
isExecutable = true; isExecutable = true;

View File

@ -110,6 +110,69 @@ CREATE INDEX attachments_post_id_idx ON attachments (post_id);
CREATE INDEX attachments_md5_hash_idx ON attachments (md5_hash); CREATE INDEX attachments_md5_hash_idx ON attachments (md5_hash);
CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops); CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops);
/*
* Function Definitions
*/
/*
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
RETURNS TABLE (post_id bigint, board_post_id bigint) AS $$
WITH inserted AS (
INSERT INTO posts (board_post_id, creation_time, body, thread_id)
SELECT np.board_post_id, np.creation_time, np.body, np.thread_id
FROM unnest(new_posts) AS np
ON CONFLICT (thread_id, board_post_id) DO NOTHING
RETURNING post_id, board_post_id
),
selected AS (
SELECT post_id, board_post_id
FROM posts
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
)
SELECT * FROM inserted
UNION ALL
SELECT * FROM selected WHERE (post_id, board_post_id) NOT IN (SELECT post_id, board_post_id FROM inserted);
$$ LANGUAGE sql;
-- 3m37s for clean db
-- 1m34s for full db (nothing inserted)
*/
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
RETURNS TABLE (post_id bigint, board_post_id bigint, thread_id bigint) AS $$
WITH
selected AS (
SELECT post_id, board_post_id, thread_id
FROM posts
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
),
to_insert AS (
SELECT np.*
FROM unnest(new_posts) AS np
LEFT OUTER JOIN selected s ON np.thread_id = s.thread_id AND np.board_post_id = s.board_post_id
WHERE s.post_id IS NULL
),
inserted AS (
INSERT INTO posts (board_post_id, creation_time, body, thread_id)
SELECT board_post_id, creation_time, body, thread_id
FROM to_insert
RETURNING post_id, board_post_id, thread_id
)
SELECT * FROM inserted
UNION ALL
SELECT * FROM selected;
$$ LANGUAGE sql;
-- 1:51 for clean db (this varies a lot)
-- 1:21 for full db (nothing inserted)
/*
* Permissions
*/
CREATE ROLE chan_archive_anon nologin; CREATE ROLE chan_archive_anon nologin;
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon; GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
GRANT SELECT ON sites TO chan_archive_anon; GRANT SELECT ON sites TO chan_archive_anon;
@ -130,9 +193,12 @@ GRANT ALL ON threads TO chan_archiver;
GRANT ALL ON posts TO chan_archiver; GRANT ALL ON posts TO chan_archiver;
GRANT ALL ON attachments TO chan_archiver; GRANT ALL ON attachments TO chan_archiver;
GRANT EXECUTE ON FUNCTION update_post_body_search_index TO chan_archiver; GRANT EXECUTE ON FUNCTION update_post_body_search_index TO chan_archiver;
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver; GRANT EXECUTE ON FUNCTION insert_posts_and_return_ids TO chan_archiver;
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver; GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE posts_post_id_seq TO chan_archiver;
GRANT chan_archiver TO admin; GRANT chan_archiver TO admin;
COMMIT; COMMIT;

View File

@ -1,7 +1,3 @@
-- {-# LANGUAGE DeriveDataTypeable #-}
-- {-# LANGUAGE DeriveGeneric #-}
-- {-# LANGUAGE OverloadedStrings #-}
module Main where module Main where
import System.Exit import System.Exit
@ -16,13 +12,14 @@ import qualified Data.Set as Set
import Data.Time.Clock.POSIX (posixSecondsToUTCTime) import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Data.Time.Clock (UTCTime) import Data.Time.Clock (UTCTime)
import JSONParsing import JSONParsing
import Types import JSONSettings
import qualified JSONPost as JSONPosts
import qualified DataClient as Client import qualified DataClient as Client
import qualified SitesType as Sites import qualified SitesType as Sites
import qualified BoardsType as Boards import qualified BoardsType as Boards
import qualified ThreadType as Threads import qualified ThreadType as Threads
import qualified PostsType as Posts
data SettingsCLI = SettingsCLI data SettingsCLI = SettingsCLI
{ jsonFile :: FilePath { jsonFile :: FilePath
@ -31,7 +28,7 @@ data SettingsCLI = SettingsCLI
settingsCLI :: SettingsCLI settingsCLI :: SettingsCLI
settingsCLI = SettingsCLI settingsCLI = SettingsCLI
{ jsonFile = def &= args &= typ "settings-jsonfile-path" { jsonFile = def &= args &= typ "settings-jsonfile-path"
} &= summary "Backfill v0.0.1" } &= summary "Backfill v0.0.2"
listCatalogDirectories :: JSONSettings -> IO [FilePath] listCatalogDirectories :: JSONSettings -> IO [FilePath]
@ -164,21 +161,107 @@ ensureThreads settings board all_threads = do
return $ archived_threads ++ new_threads return $ archived_threads ++ new_threads
readPosts
:: JSONSettings
-> Boards.Board
-> Threads.Thread
-> IO (Threads.Thread, [ JSONPosts.Post ])
readPosts settings board thread = do
result <- parsePosts thread_filename
case result of
Left err -> do
putStrLn $ "Failed to parse the JSON file " ++ thread_filename ++ " error: " ++ err
exitFailure
Right posts_wrapper -> return $ (thread, JSONPosts.posts posts_wrapper)
where
thread_filename :: FilePath
thread_filename = backupDir </> "res" </> ((show $ Threads.board_thread_id thread) ++ ".json")
backupDir :: FilePath
backupDir = backup_read_root settings </> (Boards.pathpart board)
ensurePosts
:: JSONSettings
-> Boards.Board
-> [(Threads.Thread, [ Posts.Post ])]
-> IO [(Threads.Thread, [ Posts.Post ])]
ensurePosts = undefined
-- Convert Post to DbPost
apiPostToArchivePost :: Threads.Thread -> JSONPosts.Post -> Posts.Post
apiPostToArchivePost thread post =
Posts.Post
{ Posts.post_id = Nothing
, Posts.board_post_id = JSONPosts.no post
, Posts.creation_time = posixSecondsToUTCTime (realToFrac $ JSONPosts.time post)
, Posts.body = JSONPosts.com post
, Posts.thread_id = Threads.thread_id thread
}
-- | A version of 'concatMap' that works with a monadic predicate.
-- Stolen from package extra Control.Monad.Extra
concatMapM :: Monad m => (a -> m [b]) -> [a] -> m [b]
{-# INLINE concatMapM #-}
concatMapM op = foldr f (pure [])
where f x xs = do
x_ <- op x
if null x_
then xs
else do
xs_ <- xs
pure $ x_ ++ xs_
processBoard :: JSONSettings -> Boards.Board -> IO () processBoard :: JSONSettings -> Boards.Board -> IO ()
processBoard settings board = do processBoard settings board = do
let catalogPath = backupDir </> (Boards.pathpart board) </> "catalog.json" let catalogPath = backupDir </> "catalog.json"
putStrLn $ "catalog file path: " ++ catalogPath putStrLn $ "catalog file path: " ++ catalogPath
result <- parseJSONFile catalogPath result <- parseJSONCatalog catalogPath
case result of case result of
Right catalogs -> do Right catalogs -> do
let threads_on_board = concatMap threads catalogs let threads_on_board = concatMap threads catalogs
new_threads <- ensureThreads settings board threads_on_board all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board
-- catalogs can be turned into [ Thread ]
-- ensureThreads :: ( Board, [ Thread ] ) -> IO () all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board
-- mapM_ (print . no) threads_on_board
-- putStrLn $ "Number of posts on /" ++ (Boards.pathpart board) ++ "/ " ++ (show $ length all_posts_on_board)
posts_result <- Client.postPosts settings (concatMap (\(t, posts) -> map (apiPostToArchivePost t) posts) all_posts_on_board)
-- TODO: why doesn't it insert posts for threads that already exist? we can have new posts!
case posts_result of
Left err -> print err
Right new_ids -> do
putStrLn "Sum of post_ids:"
print $ sum $ map Client.post_id new_ids
putStrLn "Sum of board_post_ids:"
print $ sum $ map Client.board_post_id new_ids
-- max: 18,645
-- min: 147
-- total: 191,628
--
-- f :: Threads.Thread -> [ Posts.Post ]
-- for each thread we have to call a function that
-- - reads the thread under the board directory:
-- - t = backupDir </> "res' </> ((show $ no thread) ++ ".json")
--
-- do we want an ensurethreads?
-- - then for each thread, grab the posts from json and see if they exist
-- - this might have to be done 350 times per board
--
-- So we need a function (Threads.Thread, [ Posts.Post ]) -> ??? [ new Post type? ]
-- - why?
-- - well because the new post type will have a thread_id, which is known to be unique
-- - so we need to query the db for this same (thread_id (from Thread), no (from Post))
return () return ()
Left errMsg -> Left errMsg ->
putStrLn $ "Failed to parse the JSON file in directory: " putStrLn $ "Failed to parse the JSON file in directory: "
@ -186,7 +269,7 @@ processBoard settings board = do
where where
backupDir :: FilePath backupDir :: FilePath
backupDir = backup_read_root settings backupDir = backup_read_root settings </> (Boards.pathpart board)
processBackupDirectory :: JSONSettings -> IO () processBackupDirectory :: JSONSettings -> IO ()
@ -208,6 +291,7 @@ processBackupDirectory settings = do
let boards :: [ Boards.Board ] = archived_boards ++ created_boards let boards :: [ Boards.Board ] = archived_boards ++ created_boards
mapM_ (processBoard settings) boards mapM_ (processBoard settings) boards
main :: IO () main :: IO ()
main = do main = do
settingsValue <- cmdArgs settingsCLI settingsValue <- cmdArgs settingsCLI

View File

@ -1,7 +1,9 @@
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveAnyClass #-}
module DataClient module DataClient
( HttpError(..) ( HttpError(..)
, PostId (..)
, get , get
, getSiteBoards , getSiteBoards
, getAllSites , getAllSites
@ -10,8 +12,10 @@ module DataClient
, postBoards , postBoards
, getThreads , getThreads
, postThreads , postThreads
, postPosts
) where ) where
import Data.Int (Int64)
import Network.HTTP.Simple import Network.HTTP.Simple
import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString.Lazy.Char8 as LC8 import qualified Data.ByteString.Lazy.Char8 as LC8
@ -27,17 +31,24 @@ import Data.Aeson
, encode , encode
, Value , Value
) )
import GHC.Generics
import qualified Types as T import qualified JSONSettings as T
import qualified SitesType as Sites import qualified SitesType as Sites
import qualified BoardsType as Boards import qualified BoardsType as Boards
import qualified ThreadType as Threads import qualified ThreadType as Threads
import qualified PostsType as Posts
data HttpError data HttpError
= HttpException SomeException = HttpException SomeException
| StatusCodeError Int LBS.ByteString | StatusCodeError Int LBS.ByteString
deriving (Show) deriving (Show)
data PostId = PostId
{ post_id :: Int64
, board_post_id :: Int64
, thread_id :: Int64
} deriving (Show, Generic, FromJSON)
get :: T.JSONSettings -> String -> IO (Either HttpError LBS.ByteString) get :: T.JSONSettings -> String -> IO (Either HttpError LBS.ByteString)
get settings path = do get settings path = do
@ -65,6 +76,7 @@ post settings path payload return_repr = do
$ initReq $ initReq
putStrLn $ "posting to " ++ requestUrl putStrLn $ "posting to " ++ requestUrl
-- putStrLn $ "Payload: " ++ (LC8.unpack payload)
handleHttp (httpLBS req) handleHttp (httpLBS req)
where where
@ -150,6 +162,17 @@ getThreads settings board_id board_thread_ids =
path = "/threads?board_thread_id=in.(" ++ ids ++ ")&board_id=eq." ++ show board_id path = "/threads?board_thread_id=in.(" ++ ids ++ ")&board_id=eq." ++ show board_id
ids :: String = intercalate "," $ map show board_thread_ids ids :: String = intercalate "," $ map show board_thread_ids
postPosts
:: T.JSONSettings
-> [ Posts.Post ]
-> IO (Either HttpError [ PostId ])
postPosts settings posts =
post settings "/rpc/insert_posts_and_return_ids" payload True >>= return . eitherDecodeResponse
where
payload = encode $ object [ "new_posts" .= posts ]
eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a
eitherDecodeResponse (Left err) = Left err eitherDecodeResponse (Left err) = Left err
eitherDecodeResponse (Right bs) = eitherDecodeResponse (Right bs) =

39
src/JSONCommonTypes.hs Normal file
View File

@ -0,0 +1,39 @@
module JSONCommonTypes
( File (..)
, Cyclical (..)
) where
import Data.Text (Text)
import qualified Data.Text as T
import Data.Aeson
import Data.Aeson.Types (typeMismatch)
import GHC.Generics
data Cyclical = Cyclical Int deriving (Show, Generic)
instance FromJSON Cyclical where
parseJSON (Number n) = return $ Cyclical (floor n)
parseJSON (String s) =
case reads (T.unpack s) :: [(Int, String)] of
[(n, "")] -> return $ Cyclical n
_ -> typeMismatch "Int or String containing Int" (String s)
parseJSON invalid = typeMismatch "Int or String" invalid
data File = File
{ id :: Text
, mime :: Maybe Text
, ext :: Text
, h :: Maybe Int
, w :: Maybe Int
, fsize :: Int
, filename :: Text
, spoiler :: Maybe Bool
, md5 :: Text
, file_path :: Text
, thumb_path :: Text
} deriving (Show, Generic)
instance FromJSON File
--instance ToJSON File

View File

@ -1,36 +1,27 @@
module JSONParsing module JSONParsing
( Thread(..) ( Thread (..)
, File(..) , Catalog (..)
, Catalog(..) , parseJSONCatalog
, parseJSONFile , parsePosts
) where ) where
{-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
import Data.Aeson import Data.Text (Text)
import GHC.Generics import GHC.Generics
import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy as B
import qualified Data.Text as T import Data.Aeson
import Data.Aeson.Types (typeMismatch)
data Cyclical = Cyclical Int deriving (Show, Generic) import qualified JSONPost as Post
import qualified JSONCommonTypes as J
instance FromJSON Cyclical where
parseJSON (Number n) = return $ Cyclical (floor n)
parseJSON (String s) =
case reads (T.unpack s) :: [(Int, String)] of
[(n, "")] -> return $ Cyclical n
_ -> typeMismatch "Int or String containing Int" (String s)
parseJSON invalid = typeMismatch "Int or String" invalid
data Thread = Thread data Thread = Thread
{ no :: Int { no :: Int
, sub :: Maybe String , sub :: Maybe Text
, com :: Maybe String , com :: Maybe Text
, name :: Maybe String , name :: Maybe Text
, capcode :: Maybe String , capcode :: Maybe Text
, time :: Int , time :: Int
, omitted_posts :: Maybe Int , omitted_posts :: Maybe Int
, omitted_images:: Maybe Int , omitted_images:: Maybe Int
@ -38,42 +29,27 @@ data Thread = Thread
, images :: Maybe Int , images :: Maybe Int
, sticky :: Maybe Int , sticky :: Maybe Int
, locked :: Maybe Int , locked :: Maybe Int
, cyclical :: Maybe Cyclical , cyclical :: Maybe J.Cyclical
, last_modified :: Int , last_modified :: Int
, board :: String , board :: Text
, files :: Maybe [File] , files :: Maybe [J.File]
, resto :: Int , resto :: Int
, unique_ips :: Maybe Int , unique_ips :: Maybe Int
} deriving (Show, Generic) } deriving (Show, Generic)
data File = File instance FromJSON Thread
{ id :: String --instance ToJSON Thread
, mime :: String
, ext :: String
, h :: Maybe Int
, w :: Maybe Int
, fsize :: Int
, filename :: String
, spoiler :: Maybe Bool
, md5 :: String
, file_path :: String
, thumb_path :: String
} deriving (Show, Generic)
data Catalog = Catalog data Catalog = Catalog
{ threads :: [Thread] { threads :: [Thread]
, page :: Int , page :: Int
} deriving (Show, Generic) } deriving (Show, Generic)
instance FromJSON Thread
--instance ToJSON Thread
instance FromJSON File
--instance ToJSON File
instance FromJSON Catalog instance FromJSON Catalog
--instance ToJSON Catalog --instance ToJSON Catalog
parseJSONCatalog :: FilePath -> IO (Either String [Catalog])
parseJSONCatalog path = B.readFile path >>= return . eitherDecode
parseJSONFile :: FilePath -> IO (Either String [Catalog]) parsePosts :: FilePath -> IO (Either String Post.PostWrapper)
parseJSONFile path = do parsePosts path = B.readFile path >>= return . eitherDecode
jsonData <- B.readFile path
return $ eitherDecode jsonData

35
src/JSONPost.hs Normal file
View File

@ -0,0 +1,35 @@
module JSONPost
( Post (..)
, PostWrapper (..)
) where
import Data.Text (Text)
import Data.Int (Int64)
import Data.Aeson (FromJSON)
import GHC.Generics
import qualified JSONCommonTypes as J
data Post = Post
{ no :: Int64
, com :: Maybe Text
, name :: Maybe Text
, time :: Int
, omitted_posts :: Maybe Int
, omitted_images :: Maybe Int
, sticky :: Maybe Int
, locked :: Maybe Int
, cyclical :: Maybe J.Cyclical
, last_modified :: Int
, board :: String
, files :: Maybe [J.File]
, resto :: Int
, unique_ips :: Maybe Int
} deriving (Show, Generic)
instance FromJSON Post
data PostWrapper = PostWrapper
{ posts :: [Post]
} deriving (Show, Generic)
instance FromJSON PostWrapper

View File

@ -1,4 +1,4 @@
module Types module JSONSettings
( JSONSettings(..) ( JSONSettings(..)
) where ) where

19
src/PostsType.hs Normal file
View File

@ -0,0 +1,19 @@
{-# LANGUAGE DeriveAnyClass #-}
module PostsType
( Post (..) )
where
import GHC.Generics
import Data.Aeson (FromJSON, ToJSON)
import Data.Time.Clock (UTCTime) -- Required for timestamp with time zone
import Data.Int (Int64)
import Data.Text (Text)
data Post = Post
{ post_id :: Maybe Int64
, board_post_id :: Int64
, creation_time :: UTCTime
, body :: Maybe Text
, thread_id :: Int
} deriving (Show, Generic, FromJSON, ToJSON)