Compare commits
2 Commits
ce097414db
...
c18be31289
Author | SHA1 | Date |
---|---|---|
towards-a-new-leftypol | c18be31289 | |
towards-a-new-leftypol | 7bab7ea3f3 |
|
@ -68,10 +68,13 @@ executable chan-delorean
|
|||
other-modules:
|
||||
JSONParsing
|
||||
DataClient
|
||||
Types
|
||||
JSONSettings
|
||||
SitesType
|
||||
BoardsType
|
||||
ThreadType
|
||||
JSONPost
|
||||
JSONCommonTypes
|
||||
PostsType
|
||||
|
||||
-- LANGUAGE extensions used by modules in this package.
|
||||
-- other-extensions:
|
||||
|
|
|
@ -9,7 +9,7 @@ let
|
|||
}:
|
||||
mkDerivation {
|
||||
pname = "chan-delorean";
|
||||
version = "0.0.0.0";
|
||||
version = "0.0.2";
|
||||
src = ./.;
|
||||
isLibrary = false;
|
||||
isExecutable = true;
|
||||
|
|
|
@ -110,6 +110,69 @@ CREATE INDEX attachments_post_id_idx ON attachments (post_id);
|
|||
CREATE INDEX attachments_md5_hash_idx ON attachments (md5_hash);
|
||||
CREATE INDEX attachments_phash_bktree_index ON attachments USING spgist (phash bktree_ops);
|
||||
|
||||
|
||||
/*
|
||||
* Function Definitions
|
||||
*/
|
||||
|
||||
/*
|
||||
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
|
||||
RETURNS TABLE (post_id bigint, board_post_id bigint) AS $$
|
||||
WITH inserted AS (
|
||||
INSERT INTO posts (board_post_id, creation_time, body, thread_id)
|
||||
SELECT np.board_post_id, np.creation_time, np.body, np.thread_id
|
||||
FROM unnest(new_posts) AS np
|
||||
ON CONFLICT (thread_id, board_post_id) DO NOTHING
|
||||
RETURNING post_id, board_post_id
|
||||
),
|
||||
selected AS (
|
||||
SELECT post_id, board_post_id
|
||||
FROM posts
|
||||
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
|
||||
)
|
||||
SELECT * FROM inserted
|
||||
UNION ALL
|
||||
SELECT * FROM selected WHERE (post_id, board_post_id) NOT IN (SELECT post_id, board_post_id FROM inserted);
|
||||
$$ LANGUAGE sql;
|
||||
|
||||
-- 3m37s for clean db
|
||||
-- 1m34s for full db (nothing inserted)
|
||||
|
||||
*/
|
||||
|
||||
CREATE OR REPLACE FUNCTION insert_posts_and_return_ids(new_posts posts[])
|
||||
RETURNS TABLE (post_id bigint, board_post_id bigint, thread_id bigint) AS $$
|
||||
WITH
|
||||
selected AS (
|
||||
SELECT post_id, board_post_id, thread_id
|
||||
FROM posts
|
||||
WHERE (thread_id, board_post_id) IN (SELECT thread_id, board_post_id FROM unnest(new_posts))
|
||||
),
|
||||
to_insert AS (
|
||||
SELECT np.*
|
||||
FROM unnest(new_posts) AS np
|
||||
LEFT OUTER JOIN selected s ON np.thread_id = s.thread_id AND np.board_post_id = s.board_post_id
|
||||
WHERE s.post_id IS NULL
|
||||
),
|
||||
inserted AS (
|
||||
INSERT INTO posts (board_post_id, creation_time, body, thread_id)
|
||||
SELECT board_post_id, creation_time, body, thread_id
|
||||
FROM to_insert
|
||||
RETURNING post_id, board_post_id, thread_id
|
||||
)
|
||||
SELECT * FROM inserted
|
||||
UNION ALL
|
||||
SELECT * FROM selected;
|
||||
$$ LANGUAGE sql;
|
||||
|
||||
-- 1:51 for clean db (this varies a lot)
|
||||
-- 1:21 for full db (nothing inserted)
|
||||
|
||||
|
||||
/*
|
||||
* Permissions
|
||||
*/
|
||||
|
||||
CREATE ROLE chan_archive_anon nologin;
|
||||
GRANT CONNECT ON DATABASE chan_archives TO chan_archive_anon;
|
||||
GRANT SELECT ON sites TO chan_archive_anon;
|
||||
|
@ -130,9 +193,12 @@ GRANT ALL ON threads TO chan_archiver;
|
|||
GRANT ALL ON posts TO chan_archiver;
|
||||
GRANT ALL ON attachments TO chan_archiver;
|
||||
GRANT EXECUTE ON FUNCTION update_post_body_search_index TO chan_archiver;
|
||||
GRANT EXECUTE ON FUNCTION insert_posts_and_return_ids TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE sites_site_id_seq TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE boards_board_id_seq TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE threads_thread_id_seq TO chan_archiver;
|
||||
GRANT usage, select ON SEQUENCE posts_post_id_seq TO chan_archiver;
|
||||
|
||||
GRANT chan_archiver TO admin;
|
||||
|
||||
COMMIT;
|
||||
|
|
112
src/Backfill.hs
112
src/Backfill.hs
|
@ -1,7 +1,3 @@
|
|||
-- {-# LANGUAGE DeriveDataTypeable #-}
|
||||
-- {-# LANGUAGE DeriveGeneric #-}
|
||||
-- {-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
module Main where
|
||||
|
||||
import System.Exit
|
||||
|
@ -16,13 +12,14 @@ import qualified Data.Set as Set
|
|||
import Data.Time.Clock.POSIX (posixSecondsToUTCTime)
|
||||
import Data.Time.Clock (UTCTime)
|
||||
|
||||
|
||||
import JSONParsing
|
||||
import Types
|
||||
import JSONSettings
|
||||
import qualified JSONPost as JSONPosts
|
||||
import qualified DataClient as Client
|
||||
import qualified SitesType as Sites
|
||||
import qualified BoardsType as Boards
|
||||
import qualified ThreadType as Threads
|
||||
import qualified PostsType as Posts
|
||||
|
||||
data SettingsCLI = SettingsCLI
|
||||
{ jsonFile :: FilePath
|
||||
|
@ -31,7 +28,7 @@ data SettingsCLI = SettingsCLI
|
|||
settingsCLI :: SettingsCLI
|
||||
settingsCLI = SettingsCLI
|
||||
{ jsonFile = def &= args &= typ "settings-jsonfile-path"
|
||||
} &= summary "Backfill v0.0.1"
|
||||
} &= summary "Backfill v0.0.2"
|
||||
|
||||
|
||||
listCatalogDirectories :: JSONSettings -> IO [FilePath]
|
||||
|
@ -164,21 +161,107 @@ ensureThreads settings board all_threads = do
|
|||
return $ archived_threads ++ new_threads
|
||||
|
||||
|
||||
readPosts
|
||||
:: JSONSettings
|
||||
-> Boards.Board
|
||||
-> Threads.Thread
|
||||
-> IO (Threads.Thread, [ JSONPosts.Post ])
|
||||
readPosts settings board thread = do
|
||||
result <- parsePosts thread_filename
|
||||
|
||||
case result of
|
||||
Left err -> do
|
||||
putStrLn $ "Failed to parse the JSON file " ++ thread_filename ++ " error: " ++ err
|
||||
exitFailure
|
||||
Right posts_wrapper -> return $ (thread, JSONPosts.posts posts_wrapper)
|
||||
|
||||
where
|
||||
thread_filename :: FilePath
|
||||
thread_filename = backupDir </> "res" </> ((show $ Threads.board_thread_id thread) ++ ".json")
|
||||
|
||||
backupDir :: FilePath
|
||||
backupDir = backup_read_root settings </> (Boards.pathpart board)
|
||||
|
||||
|
||||
ensurePosts
|
||||
:: JSONSettings
|
||||
-> Boards.Board
|
||||
-> [(Threads.Thread, [ Posts.Post ])]
|
||||
-> IO [(Threads.Thread, [ Posts.Post ])]
|
||||
ensurePosts = undefined
|
||||
|
||||
|
||||
-- Convert Post to DbPost
|
||||
apiPostToArchivePost :: Threads.Thread -> JSONPosts.Post -> Posts.Post
|
||||
apiPostToArchivePost thread post =
|
||||
Posts.Post
|
||||
{ Posts.post_id = Nothing
|
||||
, Posts.board_post_id = JSONPosts.no post
|
||||
, Posts.creation_time = posixSecondsToUTCTime (realToFrac $ JSONPosts.time post)
|
||||
, Posts.body = JSONPosts.com post
|
||||
, Posts.thread_id = Threads.thread_id thread
|
||||
}
|
||||
|
||||
-- | A version of 'concatMap' that works with a monadic predicate.
|
||||
-- Stolen from package extra Control.Monad.Extra
|
||||
concatMapM :: Monad m => (a -> m [b]) -> [a] -> m [b]
|
||||
{-# INLINE concatMapM #-}
|
||||
concatMapM op = foldr f (pure [])
|
||||
where f x xs = do
|
||||
x_ <- op x
|
||||
|
||||
if null x_
|
||||
then xs
|
||||
else do
|
||||
xs_ <- xs
|
||||
pure $ x_ ++ xs_
|
||||
|
||||
|
||||
processBoard :: JSONSettings -> Boards.Board -> IO ()
|
||||
processBoard settings board = do
|
||||
let catalogPath = backupDir </> (Boards.pathpart board) </> "catalog.json"
|
||||
let catalogPath = backupDir </> "catalog.json"
|
||||
putStrLn $ "catalog file path: " ++ catalogPath
|
||||
|
||||
result <- parseJSONFile catalogPath
|
||||
result <- parseJSONCatalog catalogPath
|
||||
|
||||
case result of
|
||||
Right catalogs -> do
|
||||
let threads_on_board = concatMap threads catalogs
|
||||
|
||||
new_threads <- ensureThreads settings board threads_on_board
|
||||
-- catalogs can be turned into [ Thread ]
|
||||
-- ensureThreads :: ( Board, [ Thread ] ) -> IO ()
|
||||
-- mapM_ (print . no) threads_on_board
|
||||
all_threads_for_board :: [ Threads.Thread ] <- ensureThreads settings board threads_on_board
|
||||
|
||||
all_posts_on_board :: [(Threads.Thread, [ JSONPosts.Post ])] <- mapM (readPosts settings board) all_threads_for_board
|
||||
|
||||
-- putStrLn $ "Number of posts on /" ++ (Boards.pathpart board) ++ "/ " ++ (show $ length all_posts_on_board)
|
||||
posts_result <- Client.postPosts settings (concatMap (\(t, posts) -> map (apiPostToArchivePost t) posts) all_posts_on_board)
|
||||
|
||||
-- TODO: why doesn't it insert posts for threads that already exist? we can have new posts!
|
||||
|
||||
case posts_result of
|
||||
Left err -> print err
|
||||
Right new_ids -> do
|
||||
putStrLn "Sum of post_ids:"
|
||||
print $ sum $ map Client.post_id new_ids
|
||||
putStrLn "Sum of board_post_ids:"
|
||||
print $ sum $ map Client.board_post_id new_ids
|
||||
|
||||
-- max: 18,645
|
||||
-- min: 147
|
||||
-- total: 191,628
|
||||
--
|
||||
-- f :: Threads.Thread -> [ Posts.Post ]
|
||||
-- for each thread we have to call a function that
|
||||
-- - reads the thread under the board directory:
|
||||
-- - t = backupDir </> "res' </> ((show $ no thread) ++ ".json")
|
||||
--
|
||||
-- do we want an ensurethreads?
|
||||
-- - then for each thread, grab the posts from json and see if they exist
|
||||
-- - this might have to be done 350 times per board
|
||||
--
|
||||
-- So we need a function (Threads.Thread, [ Posts.Post ]) -> ??? [ new Post type? ]
|
||||
-- - why?
|
||||
-- - well because the new post type will have a thread_id, which is known to be unique
|
||||
-- - so we need to query the db for this same (thread_id (from Thread), no (from Post))
|
||||
return ()
|
||||
Left errMsg ->
|
||||
putStrLn $ "Failed to parse the JSON file in directory: "
|
||||
|
@ -186,7 +269,7 @@ processBoard settings board = do
|
|||
|
||||
where
|
||||
backupDir :: FilePath
|
||||
backupDir = backup_read_root settings
|
||||
backupDir = backup_read_root settings </> (Boards.pathpart board)
|
||||
|
||||
|
||||
processBackupDirectory :: JSONSettings -> IO ()
|
||||
|
@ -208,6 +291,7 @@ processBackupDirectory settings = do
|
|||
let boards :: [ Boards.Board ] = archived_boards ++ created_boards
|
||||
mapM_ (processBoard settings) boards
|
||||
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
settingsValue <- cmdArgs settingsCLI
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE DeriveAnyClass #-}
|
||||
|
||||
module DataClient
|
||||
( HttpError(..)
|
||||
, PostId (..)
|
||||
, get
|
||||
, getSiteBoards
|
||||
, getAllSites
|
||||
|
@ -10,8 +12,10 @@ module DataClient
|
|||
, postBoards
|
||||
, getThreads
|
||||
, postThreads
|
||||
, postPosts
|
||||
) where
|
||||
|
||||
import Data.Int (Int64)
|
||||
import Network.HTTP.Simple
|
||||
import qualified Data.ByteString.Lazy as LBS
|
||||
import qualified Data.ByteString.Lazy.Char8 as LC8
|
||||
|
@ -27,17 +31,24 @@ import Data.Aeson
|
|||
, encode
|
||||
, Value
|
||||
)
|
||||
import GHC.Generics
|
||||
|
||||
import qualified Types as T
|
||||
import qualified JSONSettings as T
|
||||
import qualified SitesType as Sites
|
||||
import qualified BoardsType as Boards
|
||||
import qualified ThreadType as Threads
|
||||
import qualified PostsType as Posts
|
||||
|
||||
data HttpError
|
||||
= HttpException SomeException
|
||||
| StatusCodeError Int LBS.ByteString
|
||||
deriving (Show)
|
||||
|
||||
data PostId = PostId
|
||||
{ post_id :: Int64
|
||||
, board_post_id :: Int64
|
||||
, thread_id :: Int64
|
||||
} deriving (Show, Generic, FromJSON)
|
||||
|
||||
get :: T.JSONSettings -> String -> IO (Either HttpError LBS.ByteString)
|
||||
get settings path = do
|
||||
|
@ -65,6 +76,7 @@ post settings path payload return_repr = do
|
|||
$ initReq
|
||||
|
||||
putStrLn $ "posting to " ++ requestUrl
|
||||
-- putStrLn $ "Payload: " ++ (LC8.unpack payload)
|
||||
handleHttp (httpLBS req)
|
||||
|
||||
where
|
||||
|
@ -150,6 +162,17 @@ getThreads settings board_id board_thread_ids =
|
|||
path = "/threads?board_thread_id=in.(" ++ ids ++ ")&board_id=eq." ++ show board_id
|
||||
ids :: String = intercalate "," $ map show board_thread_ids
|
||||
|
||||
postPosts
|
||||
:: T.JSONSettings
|
||||
-> [ Posts.Post ]
|
||||
-> IO (Either HttpError [ PostId ])
|
||||
postPosts settings posts =
|
||||
post settings "/rpc/insert_posts_and_return_ids" payload True >>= return . eitherDecodeResponse
|
||||
|
||||
where
|
||||
payload = encode $ object [ "new_posts" .= posts ]
|
||||
|
||||
|
||||
eitherDecodeResponse :: (FromJSON a) => Either HttpError LBS.ByteString -> Either HttpError a
|
||||
eitherDecodeResponse (Left err) = Left err
|
||||
eitherDecodeResponse (Right bs) =
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
module JSONCommonTypes
|
||||
( File (..)
|
||||
, Cyclical (..)
|
||||
) where
|
||||
|
||||
import Data.Text (Text)
|
||||
import qualified Data.Text as T
|
||||
import Data.Aeson
|
||||
import Data.Aeson.Types (typeMismatch)
|
||||
import GHC.Generics
|
||||
|
||||
data Cyclical = Cyclical Int deriving (Show, Generic)
|
||||
|
||||
instance FromJSON Cyclical where
|
||||
parseJSON (Number n) = return $ Cyclical (floor n)
|
||||
parseJSON (String s) =
|
||||
case reads (T.unpack s) :: [(Int, String)] of
|
||||
[(n, "")] -> return $ Cyclical n
|
||||
_ -> typeMismatch "Int or String containing Int" (String s)
|
||||
|
||||
parseJSON invalid = typeMismatch "Int or String" invalid
|
||||
|
||||
|
||||
data File = File
|
||||
{ id :: Text
|
||||
, mime :: Maybe Text
|
||||
, ext :: Text
|
||||
, h :: Maybe Int
|
||||
, w :: Maybe Int
|
||||
, fsize :: Int
|
||||
, filename :: Text
|
||||
, spoiler :: Maybe Bool
|
||||
, md5 :: Text
|
||||
, file_path :: Text
|
||||
, thumb_path :: Text
|
||||
} deriving (Show, Generic)
|
||||
|
||||
instance FromJSON File
|
||||
--instance ToJSON File
|
|
@ -1,36 +1,27 @@
|
|||
module JSONParsing
|
||||
( Thread (..)
|
||||
, File(..)
|
||||
, Catalog (..)
|
||||
, parseJSONFile
|
||||
, parseJSONCatalog
|
||||
, parsePosts
|
||||
) where
|
||||
|
||||
{-# LANGUAGE DeriveGeneric #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
import Data.Aeson
|
||||
import Data.Text (Text)
|
||||
import GHC.Generics
|
||||
import qualified Data.ByteString.Lazy as B
|
||||
import qualified Data.Text as T
|
||||
import Data.Aeson.Types (typeMismatch)
|
||||
import Data.Aeson
|
||||
|
||||
data Cyclical = Cyclical Int deriving (Show, Generic)
|
||||
|
||||
instance FromJSON Cyclical where
|
||||
parseJSON (Number n) = return $ Cyclical (floor n)
|
||||
parseJSON (String s) =
|
||||
case reads (T.unpack s) :: [(Int, String)] of
|
||||
[(n, "")] -> return $ Cyclical n
|
||||
_ -> typeMismatch "Int or String containing Int" (String s)
|
||||
|
||||
parseJSON invalid = typeMismatch "Int or String" invalid
|
||||
import qualified JSONPost as Post
|
||||
import qualified JSONCommonTypes as J
|
||||
|
||||
data Thread = Thread
|
||||
{ no :: Int
|
||||
, sub :: Maybe String
|
||||
, com :: Maybe String
|
||||
, name :: Maybe String
|
||||
, capcode :: Maybe String
|
||||
, sub :: Maybe Text
|
||||
, com :: Maybe Text
|
||||
, name :: Maybe Text
|
||||
, capcode :: Maybe Text
|
||||
, time :: Int
|
||||
, omitted_posts :: Maybe Int
|
||||
, omitted_images:: Maybe Int
|
||||
|
@ -38,42 +29,27 @@ data Thread = Thread
|
|||
, images :: Maybe Int
|
||||
, sticky :: Maybe Int
|
||||
, locked :: Maybe Int
|
||||
, cyclical :: Maybe Cyclical
|
||||
, cyclical :: Maybe J.Cyclical
|
||||
, last_modified :: Int
|
||||
, board :: String
|
||||
, files :: Maybe [File]
|
||||
, board :: Text
|
||||
, files :: Maybe [J.File]
|
||||
, resto :: Int
|
||||
, unique_ips :: Maybe Int
|
||||
} deriving (Show, Generic)
|
||||
|
||||
data File = File
|
||||
{ id :: String
|
||||
, mime :: String
|
||||
, ext :: String
|
||||
, h :: Maybe Int
|
||||
, w :: Maybe Int
|
||||
, fsize :: Int
|
||||
, filename :: String
|
||||
, spoiler :: Maybe Bool
|
||||
, md5 :: String
|
||||
, file_path :: String
|
||||
, thumb_path :: String
|
||||
} deriving (Show, Generic)
|
||||
instance FromJSON Thread
|
||||
--instance ToJSON Thread
|
||||
|
||||
data Catalog = Catalog
|
||||
{ threads :: [Thread]
|
||||
, page :: Int
|
||||
} deriving (Show, Generic)
|
||||
|
||||
instance FromJSON Thread
|
||||
--instance ToJSON Thread
|
||||
instance FromJSON File
|
||||
--instance ToJSON File
|
||||
instance FromJSON Catalog
|
||||
--instance ToJSON Catalog
|
||||
|
||||
parseJSONCatalog :: FilePath -> IO (Either String [Catalog])
|
||||
parseJSONCatalog path = B.readFile path >>= return . eitherDecode
|
||||
|
||||
parseJSONFile :: FilePath -> IO (Either String [Catalog])
|
||||
parseJSONFile path = do
|
||||
jsonData <- B.readFile path
|
||||
return $ eitherDecode jsonData
|
||||
parsePosts :: FilePath -> IO (Either String Post.PostWrapper)
|
||||
parsePosts path = B.readFile path >>= return . eitherDecode
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
module JSONPost
|
||||
( Post (..)
|
||||
, PostWrapper (..)
|
||||
) where
|
||||
|
||||
import Data.Text (Text)
|
||||
import Data.Int (Int64)
|
||||
import Data.Aeson (FromJSON)
|
||||
import GHC.Generics
|
||||
import qualified JSONCommonTypes as J
|
||||
|
||||
data Post = Post
|
||||
{ no :: Int64
|
||||
, com :: Maybe Text
|
||||
, name :: Maybe Text
|
||||
, time :: Int
|
||||
, omitted_posts :: Maybe Int
|
||||
, omitted_images :: Maybe Int
|
||||
, sticky :: Maybe Int
|
||||
, locked :: Maybe Int
|
||||
, cyclical :: Maybe J.Cyclical
|
||||
, last_modified :: Int
|
||||
, board :: String
|
||||
, files :: Maybe [J.File]
|
||||
, resto :: Int
|
||||
, unique_ips :: Maybe Int
|
||||
} deriving (Show, Generic)
|
||||
|
||||
instance FromJSON Post
|
||||
|
||||
data PostWrapper = PostWrapper
|
||||
{ posts :: [Post]
|
||||
} deriving (Show, Generic)
|
||||
|
||||
instance FromJSON PostWrapper
|
|
@ -1,4 +1,4 @@
|
|||
module Types
|
||||
module JSONSettings
|
||||
( JSONSettings(..)
|
||||
) where
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
{-# LANGUAGE DeriveAnyClass #-}
|
||||
|
||||
module PostsType
|
||||
( Post (..) )
|
||||
where
|
||||
|
||||
import GHC.Generics
|
||||
import Data.Aeson (FromJSON, ToJSON)
|
||||
import Data.Time.Clock (UTCTime) -- Required for timestamp with time zone
|
||||
import Data.Int (Int64)
|
||||
import Data.Text (Text)
|
||||
|
||||
data Post = Post
|
||||
{ post_id :: Maybe Int64
|
||||
, board_post_id :: Int64
|
||||
, creation_time :: UTCTime
|
||||
, body :: Maybe Text
|
||||
, thread_id :: Int
|
||||
} deriving (Show, Generic, FromJSON, ToJSON)
|
Loading…
Reference in New Issue