diff --git a/.gitignore b/.gitignore index 484f3c7..34b1d75 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ .dbpassword -sql/spamnoticer_init.sql todo dist-newstyle/ diff --git a/chan-delorean.cabal b/chan-delorean.cabal index 78af0a6..bb1e27d 100644 --- a/chan-delorean.cabal +++ b/chan-delorean.cabal @@ -65,7 +65,8 @@ executable chan-delorean main-is: Backfill.hs -- Modules included in this executable, other than Main. - -- other-modules: + other-modules: + JSONParsing -- LANGUAGE extensions used by modules in this package. -- other-extensions: @@ -75,7 +76,10 @@ executable chan-delorean aeson, bytestring, cmdargs, - directory + directory, + filepath, + containers, + text -- Directories containing source files. hs-source-dirs: src diff --git a/sql/initialize.sql b/sql/initialize.sql index 02824b4..c199daa 100644 --- a/sql/initialize.sql +++ b/sql/initialize.sql @@ -72,6 +72,9 @@ CREATE INDEX posts_body_search_idx ON posts USING GIN (body_search_index); CREATE INDEX posts_thread_id_idx ON posts (thread_id); CREATE INDEX posts_board_post_id_idx ON posts (board_post_id); +-- This is to optimize joins on thread_id and filtering/sorting by creation_time in 'posts' table. +CREATE INDEX posts_thread_id_creation_time_idx ON posts (thread_id, creation_time); + CREATE OR REPLACE FUNCTION update_post_body_search_index() RETURNS trigger AS $$ BEGIN NEW.body_search_index := to_tsvector('english', NEW.body); diff --git a/src/Backfill.hs b/src/Backfill.hs index 9af07e1..bbbfa37 100644 --- a/src/Backfill.hs +++ b/src/Backfill.hs @@ -5,11 +5,15 @@ module Main where import System.Exit +import Control.Monad (filterM) import Data.Aeson (FromJSON, decode) import qualified Data.ByteString.Lazy as B import System.Console.CmdArgs import GHC.Generics -import System.Directory (listDirectory) +import System.Directory (listDirectory, doesFileExist) +import System.FilePath (()) + +import JSONParsing data SettingsCLI = SettingsCLI { jsonFile :: FilePath @@ -29,10 +33,39 @@ settingsCLI = SettingsCLI } &= summary "Backfill v0.0.1" --- Function to list all files and directories inside the backup_read_root -listBackupContents :: JSONSettings -> IO () -listBackupContents settings = - listDirectory (backup_read_root settings) >>= mapM_ print +listCatalogDirectories :: JSONSettings -> IO [FilePath] +listCatalogDirectories settings = do + dirs <- listDirectory (backup_read_root settings) + filterM hasCatalog dirs + where + hasCatalog dir = do + let catalogPath = (backup_read_root settings) dir "catalog.json" + doesFileExist catalogPath + + +processBackupDirectory :: JSONSettings -> IO () +processBackupDirectory settings = do + putStrLn "JSON successfully read!" + print settings -- print the decoded JSON settings + dirs <- listCatalogDirectories settings + mapM_ print dirs + mapM_ processDir dirs + where + backupDir :: FilePath + backupDir = backup_read_root settings + + processDir dir = do + let catalogPath = backupDir dir "catalog.json" + putStrLn $ "catalog file path: " ++ catalogPath + + result <- parseJSONFile catalogPath + + case result of + Right catalogs -> + mapM_ (mapM_ (print . no) . threads) catalogs + Left errMsg -> + putStrLn $ "Failed to parse the JSON file in directory: " + ++ dir ++ ". Error: " ++ errMsg main :: IO () main = do @@ -49,7 +82,4 @@ main = do Nothing -> do putStrLn "Error: Invalid JSON format." exitFailure - Just settings -> do - putStrLn "JSON successfully read!" - print settings -- print the decoded JSON settings - listBackupContents settings + Just settings -> processBackupDirectory settings diff --git a/src/JSONParsing.hs b/src/JSONParsing.hs new file mode 100644 index 0000000..4ecccbe --- /dev/null +++ b/src/JSONParsing.hs @@ -0,0 +1,79 @@ +module JSONParsing + ( Thread(..) + , File(..) + , Catalog(..) + , parseJSONFile + ) where + +{-# LANGUAGE DeriveGeneric #-} +{-# LANGUAGE OverloadedStrings #-} + +import Data.Aeson +import GHC.Generics +import qualified Data.ByteString.Lazy as B +import qualified Data.Text as T +import Data.Aeson.Types (typeMismatch) + +data Cyclical = Cyclical Int deriving (Show, Generic) + +instance FromJSON Cyclical where + parseJSON (Number n) = return $ Cyclical (floor n) + parseJSON (String s) = + case reads (T.unpack s) :: [(Int, String)] of + [(n, "")] -> return $ Cyclical n + _ -> typeMismatch "Int or String containing Int" (String s) + + parseJSON invalid = typeMismatch "Int or String" invalid + +data Thread = Thread + { no :: Int + , sub :: Maybe String + , com :: Maybe String + , name :: Maybe String + , capcode :: Maybe String + , time :: Int + , omitted_posts :: Maybe Int + , omitted_images:: Maybe Int + , replies :: Maybe Int + , images :: Maybe Int + , sticky :: Maybe Int + , locked :: Maybe Int + , cyclical :: Maybe Cyclical + , last_modified :: Int + , board :: String + , files :: Maybe [File] + , resto :: Int + , unique_ips :: Maybe Int + } deriving (Show, Generic) + +data File = File + { id :: String + , mime :: String + , ext :: String + , h :: Maybe Int + , w :: Maybe Int + , fsize :: Int + , filename :: String + , spoiler :: Maybe Bool + , md5 :: String + , file_path :: String + , thumb_path :: String + } deriving (Show, Generic) + +data Catalog = Catalog + { threads :: [Thread] + , page :: Int + } deriving (Show, Generic) + +instance FromJSON Thread +--instance ToJSON Thread +instance FromJSON File +--instance ToJSON File +instance FromJSON Catalog +--instance ToJSON Catalog + + +parseJSONFile :: FilePath -> IO (Either String [Catalog]) +parseJSONFile path = do + jsonData <- B.readFile path + return $ eitherDecode jsonData