Compare commits

...

2 Commits

10 changed files with 312 additions and 67 deletions

View File

@ -68,10 +68,13 @@ executable chan-delorean
other-modules:
JSONParsing
DataClient
Types
JSONSettings
SitesType
BoardsType
ThreadType
JSONPost
JSONCommonTypes
PostsType
-- LANGUAGE extensions used by modules in this package.
-- other-extensions:

View File

@ -9,7 +9,7 @@ let
}:
mkDerivation {
pname = "chan-delorean";
version = "0.0.0.0";
version = "0.0.2";
src = ./.;
isLibrary = false;
isExecutable = true;

View File

@ -110,6 +110,69 @@ CREATE INDEX attachments_post_id_idx ON attachments (post_id);
CREATE INDEX attachments_md5_hash_idx ON attachments (md5_hash);
CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops);
/*
* Function Definitions
*/
/*
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
RETURNS TABLE (post_id bigint, board_post_id bigint) AS $$
WITH inserted AS (
INSERT INTO posts (board_post_id, creation_time, body, thread_id)
SELECT np.board_post_id, np.creation_time, np.body, np.thread_id
FROM unnest(new_posts) AS np
ON CONFLICT (thread_id, board_post_id) DO NOTHING
RETURNING post_id, board_post_id
),
selected AS (
SELECT post_id, board_post_id
FROM posts
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
)
SELECT * FROM inserted
UNION ALL
SELECT * FROM selected WHERE (post_id, board_post_id) NOT IN (SELECT post_id, board_post_id FROM inserted);
$$ LANGUAGE sql;
-- 3m37s for clean db
-- 1m34s for full db (nothing inserted)
*/
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
RETURNS TABLE (post_id bigint, board_post_id bigint, thread_id bigint) AS $$
WITH
selected AS (
SELECT post_id, board_post_id, thread_id
FROM posts
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
),
to_insert AS (
SELECT np.*
FROM unnest(new_posts) AS np
LEFT OUTER JOIN selected s ON np.thread_id = s.thread_id AND np.board_post_id = s.board_post_id
WHERE s.post_id IS NULL
),
inserted AS (
INSERT INTO posts (board_post_id, creation_time, body, thread_id)
SELECT board_post_id, creation_time, body, thread_id
FROM to_insert
RETURNING post_id, board_post_id, thread_id
)
SELECT * FROM inserted
UNION ALL
SELECT * FROM selected;
$$ LANGUAGE sql;
-- 1:51 for clean db (this varies a lot)
-- 1:21 for full db (nothing inserted)
/*
* Permissions
*/
CREATE ROLE chan_archive_anon nologin;
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
GRANT SELECT ON sites TO chan_archive_anon;
@ -130,9 +193,12 @@ GRANT ALL ON threads TO chan_archiver;
GRANT ALL ON posts TO chan_archiver;
GRANT ALL ON attachments TO chan_archiver;
GRANT EXECUTE ON FUNCTION update_post_body_search_index TO chan_archiver;
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
GRANT EXECUTE ON FUNCTION insert_posts_and_return_ids TO chan_archiver;
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
GRANT usage, select ON SEQUENCE posts_post_id_seq TO chan_archiver;
GRANT chan_archiver TO admin;
COMMIT;

View File

@ -1,7 +1,3 @@
-- {-# LANGUAGE DeriveDataTypeable #-}
-- {-# LANGUAGE DeriveGeneric #-}
-- {-# LANGUAGE OverloadedStrings #-}
module Main where
import System.Exit
@ -16,13 +12,14 @@ import qualified Data.Set as Set
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
import Data.Time.Clock (UTCTime)
import JSONParsing
import Types
import JSONSettings
import qualified JSONPost as JSONPosts
import qualified DataClient as Client
import qualified SitesType as Sites
import qualified SitesType as Sites
import qualified BoardsType as Boards
import qualified ThreadType as Threads
import qualified PostsType as Posts
data SettingsCLI = SettingsCLI
{ jsonFile :: FilePath
@ -31,7 +28,7 @@ data SettingsCLI = SettingsCLI
settingsCLI :: SettingsCLI
settingsCLI = SettingsCLI
{ jsonFile = def &= args &= typ "settings-jsonfile-path"
} &= summary "Backfill v0.0.1"
} &= summary "Backfill v0.0.2"
listCatalogDirectories :: JSONSettings -> IO [FilePath]
@ -164,21 +161,107 @@ ensureThreads settings board all_threads = do
return $ archived_threads ++ new_threads
readPosts
:: JSONSettings
-> Boards.Board
-> Threads.Thread
-> IO (Threads.Thread, [ JSONPosts.Post ])
readPosts settings board thread = do
result <- parsePosts thread_filename
case result of
Left err -> do
putStrLn $ "Failed to parse the JSON file " ++ thread_filename ++ " error: " ++ err
exitFailure
Right posts_wrapper -> return $ (thread, JSONPosts.posts posts_wrapper)
where
thread_filename :: FilePath
thread_filename = backupDir </> "res" </> ((show $ Threads.board_thread_id thread) ++ ".json")
backupDir :: FilePath
backupDir = backup_read_root settings </> (Boards.pathpart board)
ensurePosts
:: JSONSettings
-> Boards.Board
-> [(Threads.Thread, [ Posts.Post ])]
-> IO [(Threads.Thread, [ Posts.Post ])]
ensurePosts = undefined
-- Convert Post to DbPost
apiPostToArchivePost :: Threads.Thread -> JSONPosts.Post -> Posts.Post
apiPostToArchivePost thread post =
Posts.Post
{ Posts.post_id = Nothing
, Posts.board_post_id = JSONPosts.no post
, Posts.creation_time = posixSecondsToUTCTime (realToFrac $ JSONPosts.time post)
, Posts.body = JSONPosts.com post
, Posts.thread_id = Threads.thread_id thread
}
-- | A version of 'concatMap' that works with a monadic predicate.
-- Stolen from package extra Control.Monad.Extra
concatMapM :: Monad m => (a -> m [b]) -> [a] -> m [b]
{-# INLINE concatMapM #-}
concatMapM op = foldr f (pure [])
where f x xs = do
x_ <- op x
if null x_
then xs
else do
xs_ <- xs
pure $ x_ ++ xs_
processBoard :: JSONSettings -> Boards.Board -> IO ()
processBoard settings board = do
let catalogPath = backupDir </> (Boards.pathpart board) </> "catalog.json"
let catalogPath = backupDir </> "catalog.json"
putStrLn $ "catalog file path: " ++ catalogPath
result <- parseJSONFile catalogPath
result <- parseJSONCatalog catalogPath
case result of
Right catalogs -> do
let threads_on_board = concatMap threads catalogs
new_threads <- ensureThreads settings board threads_on_board
-- catalogs can be turned into [ Thread ]
-- ensureThreads :: ( Board, [ Thread ] ) -> IO ()
-- mapM_ (print . no) threads_on_board
all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board
all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board
-- putStrLn $ "Number of posts on /" ++ (Boards.pathpart board) ++ "/ " ++ (show $ length all_posts_on_board)
posts_result <- Client.postPosts settings (concatMap (\(t, posts) -> map (apiPostToArchivePost t) posts) all_posts_on_board)
-- TODO: why doesn't it insert posts for threads that already exist? we can have new posts!
case posts_result of
Left err -> print err
Right new_ids -> do
putStrLn "Sum of post_ids:"
print $ sum $ map Client.post_id new_ids
putStrLn "Sum of board_post_ids:"
print $ sum $ map Client.board_post_id new_ids
-- max: 18,645
-- min: 147
-- total: 191,628
--
-- f :: Threads.Thread -> [ Posts.Post ]
-- for each thread we have to call a function that
-- - reads the thread under the board directory:
-- - t = backupDir </> "res' </> ((show $ no thread) ++ ".json")
--
-- do we want an ensurethreads?
-- - then for each thread, grab the posts from json and see if they exist
-- - this might have to be done 350 times per board
--
-- So we need a function (Threads.Thread, [ Posts.Post ]) -> ??? [ new Post type? ]
-- - why?
-- - well because the new post type will have a thread_id, which is known to be unique
-- - so we need to query the db for this same (thread_id (from Thread), no (from Post))
return ()
Left errMsg ->
putStrLn $ "Failed to parse the JSON file in directory: "
@ -186,7 +269,7 @@ processBoard settings board = do
where
backupDir :: FilePath
backupDir = backup_read_root settings
backupDir = backup_read_root settings </> (Boards.pathpart board)
processBackupDirectory :: JSONSettings -> IO ()
@ -208,6 +291,7 @@ processBackupDirectory settings = do
let boards :: [ Boards.Board ] = archived_boards ++ created_boards
mapM_ (processBoard settings) boards
main :: IO ()
main = do
settingsValue <- cmdArgs settingsCLI

View File

@ -1,7 +1,9 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveAnyClass #-}
module DataClient
( HttpError(..)
, PostId (..)
, get
, getSiteBoards
, getAllSites
@ -10,8 +12,10 @@ module DataClient
, postBoards
, getThreads
, postThreads
, postPosts
) where
import Data.Int (Int64)
import Network.HTTP.Simple
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString.Lazy.Char8 as LC8
@ -27,17 +31,24 @@ import Data.Aeson
, encode
, Value
)
import GHC.Generics
import qualified Types as T
import qualified JSONSettings as T
import qualified SitesType as Sites
import qualified BoardsType as Boards
import qualified ThreadType as Threads
import qualified PostsType as Posts
data HttpError
= HttpException SomeException
| StatusCodeError Int LBS.ByteString
deriving (Show)
data PostId = PostId
{ post_id :: Int64
, board_post_id :: Int64
, thread_id :: Int64
} deriving (Show, Generic, FromJSON)
get :: T.JSONSettings -> String -> IO (Either HttpError LBS.ByteString)
get settings path = do
@ -65,6 +76,7 @@ post settings path payload return_repr = do
$ initReq
putStrLn $ "posting to " ++ requestUrl
-- putStrLn $ "Payload: " ++ (LC8.unpack payload)
handleHttp (httpLBS req)
where
@ -150,6 +162,17 @@ getThreads settings board_id board_thread_ids =
path = "/threads?board_thread_id=in.(" ++ ids ++ ")&board_id=eq." ++ show board_id
ids :: String = intercalate "," $ map show board_thread_ids
postPosts
:: T.JSONSettings
-> [ Posts.Post ]
-> IO (Either HttpError [ PostId ])
postPosts settings posts =
post settings "/rpc/insert_posts_and_return_ids" payload True >>= return . eitherDecodeResponse
where
payload = encode $ object [ "new_posts" .= posts ]
eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a
eitherDecodeResponse (Left err) = Left err
eitherDecodeResponse (Right bs) =

39
src/JSONCommonTypes.hs Normal file
View File

@ -0,0 +1,39 @@
module JSONCommonTypes
( File (..)
, Cyclical (..)
) where
import Data.Text (Text)
import qualified Data.Text as T
import Data.Aeson
import Data.Aeson.Types (typeMismatch)
import GHC.Generics
data Cyclical = Cyclical Int deriving (Show, Generic)
instance FromJSON Cyclical where
parseJSON (Number n) = return $ Cyclical (floor n)
parseJSON (String s) =
case reads (T.unpack s) :: [(Int, String)] of
[(n, "")] -> return $ Cyclical n
_ -> typeMismatch "Int or String containing Int" (String s)
parseJSON invalid = typeMismatch "Int or String" invalid
data File = File
{ id :: Text
, mime :: Maybe Text
, ext :: Text
, h :: Maybe Int
, w :: Maybe Int
, fsize :: Int
, filename :: Text
, spoiler :: Maybe Bool
, md5 :: Text
, file_path :: Text
, thumb_path :: Text
} deriving (Show, Generic)
instance FromJSON File
--instance ToJSON File

View File

@ -1,36 +1,27 @@
module JSONParsing
( Thread(..)
, File(..)
, Catalog(..)
, parseJSONFile
( Thread (..)
, Catalog (..)
, parseJSONCatalog
, parsePosts
) where
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
import Data.Aeson
import Data.Text (Text)
import GHC.Generics
import qualified Data.ByteString.Lazy as B
import qualified Data.Text as T
import Data.Aeson.Types (typeMismatch)
import Data.Aeson
data Cyclical = Cyclical Int deriving (Show, Generic)
instance FromJSON Cyclical where
parseJSON (Number n) = return $ Cyclical (floor n)
parseJSON (String s) =
case reads (T.unpack s) :: [(Int, String)] of
[(n, "")] -> return $ Cyclical n
_ -> typeMismatch "Int or String containing Int" (String s)
parseJSON invalid = typeMismatch "Int or String" invalid
import qualified JSONPost as Post
import qualified JSONCommonTypes as J
data Thread = Thread
{ no :: Int
, sub :: Maybe String
, com :: Maybe String
, name :: Maybe String
, capcode :: Maybe String
, sub :: Maybe Text
, com :: Maybe Text
, name :: Maybe Text
, capcode :: Maybe Text
, time :: Int
, omitted_posts :: Maybe Int
, omitted_images:: Maybe Int
@ -38,42 +29,27 @@ data Thread = Thread
, images :: Maybe Int
, sticky :: Maybe Int
, locked :: Maybe Int
, cyclical :: Maybe Cyclical
, cyclical :: Maybe J.Cyclical
, last_modified :: Int
, board :: String
, files :: Maybe [File]
, board :: Text
, files :: Maybe [J.File]
, resto :: Int
, unique_ips :: Maybe Int
} deriving (Show, Generic)
data File = File
{ id :: String
, mime :: String
, ext :: String
, h :: Maybe Int
, w :: Maybe Int
, fsize :: Int
, filename :: String
, spoiler :: Maybe Bool
, md5 :: String
, file_path :: String
, thumb_path :: String
} deriving (Show, Generic)
instance FromJSON Thread
--instance ToJSON Thread
data Catalog = Catalog
{ threads :: [Thread]
, page :: Int
} deriving (Show, Generic)
instance FromJSON Thread
--instance ToJSON Thread
instance FromJSON File
--instance ToJSON File
instance FromJSON Catalog
--instance ToJSON Catalog
parseJSONCatalog :: FilePath -> IO (Either String [Catalog])
parseJSONCatalog path = B.readFile path >>= return . eitherDecode
parseJSONFile :: FilePath -> IO (Either String [Catalog])
parseJSONFile path = do
jsonData <- B.readFile path
return $ eitherDecode jsonData
parsePosts :: FilePath -> IO (Either String Post.PostWrapper)
parsePosts path = B.readFile path >>= return . eitherDecode

35
src/JSONPost.hs Normal file
View File

@ -0,0 +1,35 @@
module JSONPost
( Post (..)
, PostWrapper (..)
) where
import Data.Text (Text)
import Data.Int (Int64)
import Data.Aeson (FromJSON)
import GHC.Generics
import qualified JSONCommonTypes as J
data Post = Post
{ no :: Int64
, com :: Maybe Text
, name :: Maybe Text
, time :: Int
, omitted_posts :: Maybe Int
, omitted_images :: Maybe Int
, sticky :: Maybe Int
, locked :: Maybe Int
, cyclical :: Maybe J.Cyclical
, last_modified :: Int
, board :: String
, files :: Maybe [J.File]
, resto :: Int
, unique_ips :: Maybe Int
} deriving (Show, Generic)
instance FromJSON Post
data PostWrapper = PostWrapper
{ posts :: [Post]
} deriving (Show, Generic)
instance FromJSON PostWrapper

View File

@ -1,4 +1,4 @@
module Types
module JSONSettings
( JSONSettings(..)
) where

19
src/PostsType.hs Normal file
View File

@ -0,0 +1,19 @@
{-# LANGUAGE DeriveAnyClass #-}
module PostsType
( Post (..) )
where
import GHC.Generics
import Data.Aeson (FromJSON, ToJSON)
import Data.Time.Clock (UTCTime) -- Required for timestamp with time zone
import Data.Int (Int64)
import Data.Text (Text)
data Post = Post
{ post_id :: Maybe Int64
, board_post_id :: Int64
, creation_time :: UTCTime
, body :: Maybe Text
, thread_id :: Int
} deriving (Show, Generic, FromJSON, ToJSON)