From 29c8f4eedb2e3f4567031a9dcd50b89dbb7a883f Mon Sep 17 00:00:00 2001 From: towards-a-new-leftypol Date: Sat, 20 Jan 2024 21:04:13 -0500 Subject: [PATCH] WIP: Save Attachments - nothing works well but the code to compute sha256 and perceptual hash on files is there... --- chan-delorean.cabal | 5 ++++- nix-support/hip.nix | 17 ++++++++++++++++ nix-support/http-client.nix | 13 ++++++++++++ nix-support/http-conduit.nix | 35 +++++++++++++++++++++++++++++++++ nix-support/perceptual-hash.nix | 17 ++++++++++++++++ nix-support/repa-custom.nix | 14 +++++++++++++ nix-support/repa.nix | 13 ++++++++++++ shell.nix | 13 +++++++++--- src/Backfill.hs | 22 +++++++++++++++++---- src/Data/WordUtil.hs | 18 +++++++++++++++++ src/DataClient.hs | 29 ++++++++++++++++++++++++--- 11 files changed, 185 insertions(+), 11 deletions(-) create mode 100644 nix-support/hip.nix create mode 100644 nix-support/http-client.nix create mode 100644 nix-support/http-conduit.nix create mode 100644 nix-support/perceptual-hash.nix create mode 100644 nix-support/repa-custom.nix create mode 100644 nix-support/repa.nix create mode 100644 src/Data/WordUtil.hs diff --git a/chan-delorean.cabal b/chan-delorean.cabal index ae32bb7..d36181e 100644 --- a/chan-delorean.cabal +++ b/chan-delorean.cabal @@ -77,6 +77,7 @@ executable chan-delorean Common.PostsType AttachmentType Hash + Data.WordUtil -- LANGUAGE extensions used by modules in this package. -- other-extensions: @@ -90,13 +91,15 @@ executable chan-delorean filepath, containers, text, + http-client, http-conduit, safe-exceptions, http-types, time, cryptonite, memory, - mime-types + mime-types, + perceptual-hash -- Directories containing source files. hs-source-dirs: src diff --git a/nix-support/hip.nix b/nix-support/hip.nix new file mode 100644 index 0000000..b0b050e --- /dev/null +++ b/nix-support/hip.nix @@ -0,0 +1,17 @@ +{ nixpkgs ? import {} }: + +let + haskellPackages = nixpkgs.haskellPackages.override { + overrides = self: super: { + repa = import ./repa.nix { inherit nixpkgs; }; + }; + }; + + hip = haskellPackages.callCabal2nix "hip" (nixpkgs.fetchFromGitHub { + owner = "lehins"; + repo = "hip"; + rev = "9f1111ea8e3f6d284404074cb6ac3e2ff164f0fe"; + sha256 = "sha256-/GhspMhU5MjFu7B6AU4tkaqqNGpR3daoOJpuatHemlM="; + }) {}; +in + hip diff --git a/nix-support/http-client.nix b/nix-support/http-client.nix new file mode 100644 index 0000000..0a8fd38 --- /dev/null +++ b/nix-support/http-client.nix @@ -0,0 +1,13 @@ +{ nixpkgs ? import {} }: + +let + haskellPackages = nixpkgs.haskellPackages; + http-client = haskellPackages.callCabal2nix "http-client" (nixpkgs.fetchFromGitHub { + owner = "towards-a-new-leftypol"; + repo = "http-client"; + rev = "7900de1687fe1ec244abf9801009d27a619a3ba0"; + sha256 = ""; + } + "/http-client") {}; +in + http-client + diff --git a/nix-support/http-conduit.nix b/nix-support/http-conduit.nix new file mode 100644 index 0000000..130e7c6 --- /dev/null +++ b/nix-support/http-conduit.nix @@ -0,0 +1,35 @@ +{ nixpkgs ? import {} }: + +let + haskellPackages = nixpkgs.haskellPackages; + + src = nixpkgs.fetchFromGitHub { + owner = "towards-a-new-leftypol"; + repo = "http-client"; + rev = "a32d92fff9171a8beb948b430e274a1667b3ca35"; + sha256 = "sha256-LfBTsB2fHZ1z+wvt3mowowXL+Ta7jesj+L3Y8NqSmfI="; + }; + + http-client = haskellPackages.callCabal2nix "http-client" (src + "/http-client") {}; + + conduitPackages = nixpkgs.haskellPackages.override { + overrides = self: super: { + http-client = http-client; + }; + }; + + http-conduit = conduitPackages.callCabal2nix "http-conduit" (src + "/http-conduit") {}; + + env = http-client.env.overrideAttrs (oldAttrs: { + buildInputs = oldAttrs.buildInputs ++ [ + haskellPackages.cabal-install + ]; + }); +in + + { + http-conduit = http-conduit; + http-client = http-client; + env = env; + } + diff --git a/nix-support/perceptual-hash.nix b/nix-support/perceptual-hash.nix new file mode 100644 index 0000000..b3d926c --- /dev/null +++ b/nix-support/perceptual-hash.nix @@ -0,0 +1,17 @@ +{ nixpkgs ? import {} }: + +let + haskellPackages = nixpkgs.haskellPackages.override { + overrides = self: super: { + hip = import ./hip.nix { inherit nixpkgs; }; + }; + }; + + perceptual-hash = haskellPackages.callCabal2nix "perceptual-hash" (nixpkgs.fetchFromGitHub { + owner = "vmchale"; + repo = "phash"; + rev = "90fa59bec93b6adcad614d37033c1f2e059b6e46"; + sha256 = "sha256-CysvgcgzIuvKFEwNnCYHB6MXGzQnBQN7Nuo6HADuavk="; + }) {}; +in + perceptual-hash diff --git a/nix-support/repa-custom.nix b/nix-support/repa-custom.nix new file mode 100644 index 0000000..ff85ee5 --- /dev/null +++ b/nix-support/repa-custom.nix @@ -0,0 +1,14 @@ +{ nixpkgs ? import {} }: + +let + haskellPackages = nixpkgs.haskellPackages; + repa = haskellPackages.callCabal2nix "repa" ./repa/repa {}; + + env = repa.env.overrideAttrs (oldAttrs: { + buildInputs = oldAttrs.buildInputs ++ [ + haskellPackages.cabal-install + ]; + }); + +in + if nixpkgs.lib.inNixShell then env else repa diff --git a/nix-support/repa.nix b/nix-support/repa.nix new file mode 100644 index 0000000..aa4bdd9 --- /dev/null +++ b/nix-support/repa.nix @@ -0,0 +1,13 @@ +{ nixpkgs ? import {} }: + +let + haskellPackages = nixpkgs.haskellPackages; + repa = haskellPackages.callCabal2nix "repa" (nixpkgs.fetchFromGitHub { + owner = "towards-a-new-leftypol"; + repo = "repa"; + rev = "8d30d271d0dc87f39fdc8dd6d330a9c59174892e"; + sha256 = "sha256-ECx7YhOFHQDy+/qlMCcM+EHud/r/3jVTPeWXVLUOawE="; + } + "/repa") {}; +in + repa + diff --git a/shell.nix b/shell.nix index 29292c1..e3a7316 100644 --- a/shell.nix +++ b/shell.nix @@ -3,9 +3,13 @@ let inherit (nixpkgs) pkgs; + perceptual-hash = import ./nix-support/perceptual-hash.nix { inherit nixpkgs; }; + http-conduit = import ./nix-support/http-conduit.nix { inherit nixpkgs; }; + f = { mkDerivation, base, stdenv, cabal-install, aeson, safe-exceptions, bytestring, cmdargs, - http-conduit, cryptonite, memory, mime-types + http-conduit, cryptonite, memory, mime-types, + perceptual-hash }: mkDerivation { pname = "chan-delorean"; @@ -15,7 +19,7 @@ let isExecutable = true; executableHaskellDepends = [ base safe-exceptions aeson bytestring cmdargs http-conduit - cryptonite memory mime-types + cryptonite memory mime-types perceptual-hash ]; testHaskellDepends = [ cabal-install ]; license = "unknown"; @@ -27,7 +31,10 @@ let variant = if doBenchmark then pkgs.haskell.lib.doBenchmark else pkgs.lib.id; - drv = variant (haskellPackages.callPackage f {}); + drv = variant (haskellPackages.callPackage f { + perceptual-hash = perceptual-hash; + http-conduit = http-conduit.http-conduit; + }); enhancedDrv = if pkgs.lib.inNixShell then drv.env.overrideAttrs (oldAttrs: { diff --git a/src/Backfill.hs b/src/Backfill.hs index 3c59c12..0e48a98 100644 --- a/src/Backfill.hs +++ b/src/Backfill.hs @@ -23,6 +23,7 @@ import Data.Maybe (fromJust) import Data.Text (Text, unpack) import Data.Text.Encoding (decodeUtf8) import Network.Mime (defaultMimeLookup, defaultMimeType) +import PerceptualHash (fileHash) import JSONParsing import JSONSettings @@ -35,6 +36,7 @@ import qualified ThreadType as Threads import qualified AttachmentType as At import qualified Common.PostsType as Posts import qualified Hash as Hash +import qualified Data.WordUtil as Words newtype SettingsCLI = SettingsCLI { jsonFile :: FilePath @@ -261,7 +263,7 @@ processFiles settings post_pairs = do -- perfect just means that our posts have case existing_attachments_result of Left err -> do - putStrLn $ "Error fetching boards: " ++ show err + putStrLn $ "Error fetching attachments: " ++ show err exitFailure Right existing_attachments -> do let map_existing :: Map.Map Int64 [ At.Attachment ] = @@ -290,6 +292,8 @@ processFiles settings post_pairs = do -- perfect just means that our posts have let to_insert_ = filter ((`Set.notMember` existing_hashes) . At.sha256_hash) have_hash + _ <- Client.postAttachments settings to_insert_ + -- TODO: Concat all values in to_insert_map and -- go ahead and compute sha256 and phashes on them. -- ensure that the sha256 in to_insert_map is not in map_existing @@ -300,12 +304,22 @@ processFiles settings post_pairs = do -- perfect just means that our posts have computeAttachmentHash :: (At.Paths, At.Attachment) -> IO At.Attachment computeAttachmentHash (p, q) = do let f = backup_read_root settings <> "/" <> unpack (At.file_path p) + sha256_sum <- Hash.computeSHA256 f - return q { At.sha256_hash = sha256_sum } + either_phash <- fileHash f - computeAttachmentPhash :: (At.Paths, At.Attachment) -> IO At.Attachment - computeAttachmentPhash = undefined + phash :: Int64 <- case either_phash of + Left err_str -> do + putStrLn $ "Failed to compute phash for file " ++ (unpack sha256_sum) ++ " " ++ f ++ " " ++ err_str + return (-1) + Right phash_w -> return $ Words.wordToSignedInt64 phash_w + + + return q + { At.sha256_hash = sha256_sum + , At.phash = phash + } parseLegacyPaths :: JSONPosts.Post -> Maybe At.Paths parseLegacyPaths post = do diff --git a/src/Data/WordUtil.hs b/src/Data/WordUtil.hs new file mode 100644 index 0000000..6a85eb5 --- /dev/null +++ b/src/Data/WordUtil.hs @@ -0,0 +1,18 @@ +-- These two functions were stolen from the module +-- Data.ProtoLens.Encoding.Bytes from the proto-lens library. + +module Data.WordUtil +( signedInt64ToWord +, wordToSignedInt64 +) where + +import Data.Int (Int64) +import Data.Word (Word64) +import Data.Bits ((.&.), xor, shiftR, shiftL) + +signedInt64ToWord :: Int64 -> Word64 +signedInt64ToWord n = fromIntegral $ shiftL n 1 `xor` shiftR n 63 + +wordToSignedInt64 :: Word64 -> Int64 +wordToSignedInt64 n + = fromIntegral (shiftR n 1) `xor` negate (fromIntegral $ n .&. 1) diff --git a/src/DataClient.hs b/src/DataClient.hs index 2574f29..0ba11e5 100644 --- a/src/DataClient.hs +++ b/src/DataClient.hs @@ -14,10 +14,17 @@ module DataClient , postThreads , postPosts , getAttachments + , postAttachments ) where import Data.Int (Int64) -import Network.HTTP.Simple +import Network.HTTP.Simple hiding (httpLbs) +import Network.HTTP.Client + ( newManager + , managerSetMaxHeaderLength + , httpLbs + ) +import Network.HTTP.Client.Conduit (defaultManagerSettings) import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Lazy.Char8 as LC8 import Network.HTTP.Types.Status (statusCode) @@ -66,7 +73,10 @@ get settings path = do initReq <- parseRequest requestUrl let req = setRequestHeader "Authorization" [C8.pack $ "Bearer " ++ T.jwt settings] initReq putStrLn $ "calling " ++ requestUrl - handleHttp (httpLBS req) + + let man_settings = managerSetMaxHeaderLength (16384 * 4) defaultManagerSettings + manager <- newManager man_settings + handleHttp (httpLbs req manager) post @@ -106,7 +116,9 @@ handleHttp action = do in if 200 <= (statusCode $ getResponseStatus response) && (statusCode $ getResponseStatus response) < 300 then return $ Right responseBody else return $ Left (StatusCodeError (statusCode $ getResponseStatus response) responseBody) - Left e -> return $ Left $ HttpException e + Left e -> do + putStrLn "Some nasty http exception must have occurred" + return $ Left $ HttpException e getSiteBoards :: T.JSONSettings -> Int -> IO (Either HttpError [ Boards.Board ]) @@ -182,6 +194,17 @@ getAttachments settings post_ids = hashes :: String = intercalate "," $ (map show post_ids) +postAttachments + :: T.JSONSettings + -> [ Attachments.Attachment ] + -> IO (Either HttpError [ Attachments.Attachment ]) +postAttachments settings attachments = + post settings "/attachments" payload True >>= return . eitherDecodeResponse + + where + payload = encode attachments + + postPosts :: T.JSONSettings -> [ Posts.Post ]