From 4443dceb4df862b26f4e419e30b5a0a3b80e97d2 Mon Sep 17 00:00:00 2001 From: towards-a-new-leftypol Date: Sun, 31 Mar 2024 16:04:15 -0400 Subject: [PATCH] Implement BodyParser using html-parse - based on implementation from front-end, but that one uses the DOM api --- app/Common | 2 +- app/Main.hs | 12 ++-- app/Parsing/BodyParser.hs | 119 +++++++++++++++++++++++++++++++++- app/Parsing/BodyParserTest.hs | 87 +++++++++++++++++++++++++ chandlr-server.cabal | 3 +- default.nix | 2 + nix-support/html-parse.nix | 16 +++++ 7 files changed, 234 insertions(+), 7 deletions(-) create mode 100644 app/Parsing/BodyParserTest.hs create mode 100644 nix-support/html-parse.nix diff --git a/app/Common b/app/Common index 6b210bf..583c6b6 160000 --- a/app/Common +++ b/app/Common @@ -1 +1 @@ -Subproject commit 6b210bf185141b4716e06937c1b93a233bac1fcd +Subproject commit 583c6b637a6b3938ec81a3265ead91d8f6012847 diff --git a/app/Main.hs b/app/Main.hs index ca1e7d3..f576ef1 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -49,6 +49,7 @@ import qualified Common.Component.CatalogGrid as Grid import qualified Common.Component.TimeControl as TC import Common.FrontEnd.Action (GetThreadArgs (..)) import qualified Common.Component.Thread.Model as Thread +import qualified Common.Component.ThreadView as Thread import Common.Network.SiteType (Site) data HtmlPage a = forall b. (ToJSON b) => HtmlPage (JSONSettings, b, a) @@ -174,11 +175,14 @@ threadView settings website board_pathpart board_thread_id = do case thread_results of Left err -> throwError $ err500 { errBody = fromString $ show err } - Right site -> pure $ render now $ head site + Right site -> do + let s = head site + posts_and_bodies <- liftIO $ Thread.getPostWithBodies s + pure $ render posts_and_bodies now s where - render :: UTCTime -> Site -> HtmlPage (View FE.Action) - render t site = + render :: [ Thread.PostWithBody ] -> UTCTime -> Site -> HtmlPage (View FE.Action) + render posts_and_bodies t site = HtmlPage ( settings , site @@ -200,7 +204,7 @@ threadView settings website board_pathpart board_thread_id = do thread_model = Thread.Model { Thread.site = site , Thread.media_root = pack $ media_root settings - , Thread.post_bodies = [] + , Thread.post_bodies = posts_and_bodies , Thread.current_time = t } diff --git a/app/Parsing/BodyParser.hs b/app/Parsing/BodyParser.hs index 0bd38d9..a561706 100644 --- a/app/Parsing/BodyParser.hs +++ b/app/Parsing/BodyParser.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE OverloadedStrings #-} + module Parsing.BodyParser ( PostPart (..) , parsePostBody @@ -6,9 +8,124 @@ module Parsing.BodyParser ) where import Data.Text (Text) +import qualified Data.Text as Text +import Text.HTML.Parser + ( parseTokens + , canonicalizeTokens + , Token (..) + , Attr(..) + ) +import Text.HTML.Tree (tokensToForest) +import Data.Tree (Forest, Tree (..)) import Common.Parsing.PostPartType +import Common.Parsing.QuoteLinkParser import Common.Parsing.PostBodyUtils + +getAttr :: Text -> [ Attr ] -> Maybe Text +getAttr _ [] = Nothing +getAttr attrName (Attr x y:xs) + | x == attrName = Just y + | otherwise = getAttr attrName xs + + parsePostBody :: Text -> IO [ PostPart ] -parsePostBody = undefined +parsePostBody html = + case tokensToForest $ canonicalizeTokens $ parseTokens html of + Left err -> do + print err + return [] + + Right forest -> return $ forestToPostParts forest + + +forestToPostParts :: Forest Token -> [ PostPart ] +forestToPostParts = concatMap treeToPostParts + + +treeToPostParts :: Tree Token -> [ PostPart ] +treeToPostParts Node { rootLabel = (TagOpen "a" attrs) } = + let m_href = getAttr "href" attrs + in case m_href of + Nothing -> + [ SimpleText "Anchor without href" ] + Just href -> + let target = getAttr "target" attrs + in case target of + Just "_blank" -> + [ PostedUrl href ] + _ -> + [ Quote $ parseURL $ Text.unpack href ] + + +treeToPostParts Node { rootLabel = (TagOpen "span" attrs), subForest } = + maybe [] (:[]) $ foldr foldfunc Nothing classList + + where + classList :: [ Text ] + classList = maybe [] Text.words $ getAttr "class" attrs + + foldfunc :: Text -> Maybe PostPart -> Maybe PostPart + foldfunc cls Nothing = (>>= \p -> Just $ p $ forestToPostParts subForest) $ matchPart cls + foldfunc _ x@(Just _) = x + + matchPart :: Text -> Maybe ([ PostPart ] -> PostPart) + matchPart "quote" = Just GreenText + matchPart "orangeQuote" = Just OrangeText + matchPart "heading" = Just RedText + matchPart "spoiler" = Just Spoiler + matchPart _ = Nothing + +treeToPostParts Node { rootLabel = (TagOpen "em" _), subForest } = + [ Italics $ forestToPostParts subForest ] + +treeToPostParts Node { rootLabel = (TagOpen "strong" _), subForest } = + [ Bold $ forestToPostParts subForest ] + +treeToPostParts Node { rootLabel = (TagOpen "u" _), subForest } = + [ Underlined $ forestToPostParts subForest ] + +treeToPostParts Node { rootLabel = (TagOpen "s" _), subForest } = + [ Strikethrough $ forestToPostParts subForest ] + +treeToPostParts Node { rootLabel = (TagOpen "pre" _), subForest } = + [ Code $ forestToPostParts subForest ] + +treeToPostParts Node { rootLabel = (TagOpen "br" _) } = + [ Skip ] + +treeToPostParts Node { rootLabel = (ContentText txt) } = [ SimpleText txt ] + +treeToPostParts _ = [ Skip ] + +-- Forest == [ Tree Token ] +-- +-- data Tree a = Node { +-- rootLabel :: a, -- ^ label value +-- subForest :: [Tree a] -- ^ zero or more child trees +-- } +-- +-- Tree a == Tree Token +-- +-- data Tree Token = Node { +-- rootLabel :: Token, -- ^ label value +-- subForest :: [Tree Token] -- ^ zero or more child trees +-- } +-- +-- data Token +-- -- | An opening tag. Attribute ordering is arbitrary. Void elements have a 'TagOpen' but no corresponding 'TagClose'. See 'Text.HTML.Tree.nonClosing'. +-- = TagOpen !TagName [Attr] +-- -- | A self-closing tag. +-- | TagSelfClose !TagName [Attr] +-- -- | A closing tag. +-- | TagClose !TagName +-- -- | The content between tags. +-- | ContentText !Text +-- -- | A single character of content +-- | ContentChar !Char +-- -- | Contents of a comment. +-- | Comment !Builder +-- -- | Doctype +-- | Doctype !Text +-- deriving (Show, Ord, Eq, Generic) diff --git a/app/Parsing/BodyParserTest.hs b/app/Parsing/BodyParserTest.hs new file mode 100644 index 0000000..088b673 --- /dev/null +++ b/app/Parsing/BodyParserTest.hs @@ -0,0 +1,87 @@ +{-# LANGUAGE OverloadedStrings #-} + +module Parsing.BodyParser where + +import Data.Text (Text) +import qualified Data.Text.IO as Text +import Text.HTML.Parser (parseTokens, canonicalizeTokens, Token (..), Attr(..)) +import Text.HTML.Tree (tokensToForest) +import Data.Tree (Forest, Tree (..)) + +getAttr :: Text -> [ Attr ] -> Maybe Text +getAttr _ [] = Nothing +getAttr attrName (Attr x y:xs) + | x == attrName = Just y + | otherwise = getAttr attrName xs + +parsePostBody :: Text -> IO () +parsePostBody html = + case tokensToForest $ canonicalizeTokens $ parseTokens html of + Left err -> do + print err + + Right forest -> forestToPostParts forest + + +forestToPostParts :: Forest Token -> IO () +forestToPostParts = mapM_ treeToPostParts + +treeToPostParts :: Tree Token -> IO () +treeToPostParts Node { rootLabel = tok@(TagOpen "a" attrs) } = + do + putStrLn "Anchor Tag Open" + print attrs + print $ getAttr "href" attrs + +treeToPostParts Node { rootLabel = tok@(TagSelfClose tagname attrs) } = + do + putStrLn "Tag Self-Close" + putStrLn $ "tagname: " ++ show tagname + putStrLn $ "attrs: " ++ show attrs + +-- treeToPostParts Node { rootLabel = (ContentText txt) } = Text.putStrLn txt + +treeToPostParts node@Node { rootLabel } = + do + putStrLn $ "Something Else: " ++ show rootLabel + +-- Forest == [ Tree Token ] +-- +-- data Tree a = Node { +-- rootLabel :: a, -- ^ label value +-- subForest :: [Tree a] -- ^ zero or more child trees +-- } +-- +-- Tree a == Tree Token +-- +-- data Tree Token = Node { +-- rootLabel :: Token, -- ^ label value +-- subForest :: [Tree Token] -- ^ zero or more child trees +-- } +-- +-- and what is a Token? We might need to make a toy program with this +-- +-- data Token +-- -- | An opening tag. Attribute ordering is arbitrary. Void elements have a 'TagOpen' but no corresponding 'TagClose'. See 'Text.HTML.Tree.nonClosing'. +-- = TagOpen !TagName [Attr] +-- -- | A self-closing tag. +-- | TagSelfClose !TagName [Attr] +-- -- | A closing tag. +-- | TagClose !TagName +-- -- | The content between tags. +-- | ContentText !Text +-- -- | A single character of content +-- | ContentChar !Char +-- -- | Contents of a comment. +-- | Comment !Builder +-- -- | Doctype +-- | Doctype !Text +-- deriving (Show, Ord, Eq, Generic) + +-- Add your main method here +main :: IO () +main = do + -- Read the contents of the file + fileContents <- Text.readFile "/home/phil/Documents/haskell/chandlr-miso/html/body.html" + + parsePostBody fileContents diff --git a/chandlr-server.cabal b/chandlr-server.cabal index 06588fd..f186c7b 100644 --- a/chandlr-server.cabal +++ b/chandlr-server.cabal @@ -122,7 +122,8 @@ executable chandlr-server http-types, cmdargs, mtl, - utf8-string + utf8-string, + html-parse -- Directories containing source files. hs-source-dirs: app diff --git a/default.nix b/default.nix index 714c2e8..6fd4496 100644 --- a/default.nix +++ b/default.nix @@ -2,9 +2,11 @@ let http-conduit = import ./app/Common/nix-support/http-conduit.nix { inherit nixpkgs; }; + html-parse = import ./nix-support/html-parse.nix { inherit nixpkgs; }; drv = nixpkgs.haskellPackages.callCabal2nix "chandlr-server" ./. { http-conduit = http-conduit.http-conduit; + html-parse = html-parse; }; env = drv.env.overrideAttrs (oldAttrs: { diff --git a/nix-support/html-parse.nix b/nix-support/html-parse.nix new file mode 100644 index 0000000..a778713 --- /dev/null +++ b/nix-support/html-parse.nix @@ -0,0 +1,16 @@ +{ nixpkgs ? import {} }: + +let + haskellPackages = nixpkgs.haskellPackages; + + src = nixpkgs.fetchFromGitHub { + owner = "bgamari"; + repo = "html-parse"; + rev = "fcbdfe4ae4da14e9af0e21fabfe2da178f041970"; + sha256 = "sha256-naCRlbDGtcCBDiHOPsZwQ3dzVbFtkC1dr5r6ejdewiI="; + }; + + html-parse = haskellPackages.callCabal2nix "html-parse" src {}; +in + + html-parse