Implement BodyParser using html-parse

- based on implementation from front-end, but that one uses the DOM api
This commit is contained in:
towards-a-new-leftypol 2024-03-31 16:04:15 -04:00
parent 644011dd23
commit 4443dceb4d
7 changed files with 234 additions and 7 deletions

@ -1 +1 @@
Subproject commit 6b210bf185141b4716e06937c1b93a233bac1fcd Subproject commit 583c6b637a6b3938ec81a3265ead91d8f6012847

View File

@ -49,6 +49,7 @@ import qualified Common.Component.CatalogGrid as Grid
import qualified Common.Component.TimeControl as TC import qualified Common.Component.TimeControl as TC
import Common.FrontEnd.Action (GetThreadArgs (..)) import Common.FrontEnd.Action (GetThreadArgs (..))
import qualified Common.Component.Thread.Model as Thread import qualified Common.Component.Thread.Model as Thread
import qualified Common.Component.ThreadView as Thread
import Common.Network.SiteType (Site) import Common.Network.SiteType (Site)
data HtmlPage a = forall b. (ToJSON b) => HtmlPage (JSONSettings, b, a) data HtmlPage a = forall b. (ToJSON b) => HtmlPage (JSONSettings, b, a)
@ -174,11 +175,14 @@ threadView settings website board_pathpart board_thread_id = do
case thread_results of case thread_results of
Left err -> throwError $ err500 { errBody = fromString $ show err } Left err -> throwError $ err500 { errBody = fromString $ show err }
Right site -> pure $ render now $ head site Right site -> do
let s = head site
posts_and_bodies <- liftIO $ Thread.getPostWithBodies s
pure $ render posts_and_bodies now s
where where
render :: UTCTime -> Site -> HtmlPage (View FE.Action) render :: [ Thread.PostWithBody ] -> UTCTime -> Site -> HtmlPage (View FE.Action)
render t site = render posts_and_bodies t site =
HtmlPage HtmlPage
( settings ( settings
, site , site
@ -200,7 +204,7 @@ threadView settings website board_pathpart board_thread_id = do
thread_model = Thread.Model thread_model = Thread.Model
{ Thread.site = site { Thread.site = site
, Thread.media_root = pack $ media_root settings , Thread.media_root = pack $ media_root settings
, Thread.post_bodies = [] , Thread.post_bodies = posts_and_bodies
, Thread.current_time = t , Thread.current_time = t
} }

View File

@ -1,3 +1,5 @@
{-# LANGUAGE OverloadedStrings #-}
module Parsing.BodyParser module Parsing.BodyParser
( PostPart (..) ( PostPart (..)
, parsePostBody , parsePostBody
@ -6,9 +8,124 @@ module Parsing.BodyParser
) where ) where
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as Text
import Text.HTML.Parser
( parseTokens
, canonicalizeTokens
, Token (..)
, Attr(..)
)
import Text.HTML.Tree (tokensToForest)
import Data.Tree (Forest, Tree (..))
import Common.Parsing.PostPartType import Common.Parsing.PostPartType
import Common.Parsing.QuoteLinkParser
import Common.Parsing.PostBodyUtils import Common.Parsing.PostBodyUtils
getAttr :: Text -> [ Attr ] -> Maybe Text
getAttr _ [] = Nothing
getAttr attrName (Attr x y:xs)
| x == attrName = Just y
| otherwise = getAttr attrName xs
parsePostBody :: Text -> IO [ PostPart ] parsePostBody :: Text -> IO [ PostPart ]
parsePostBody = undefined parsePostBody html =
case tokensToForest $ canonicalizeTokens $ parseTokens html of
Left err -> do
print err
return []
Right forest -> return $ forestToPostParts forest
forestToPostParts :: Forest Token -> [ PostPart ]
forestToPostParts = concatMap treeToPostParts
treeToPostParts :: Tree Token -> [ PostPart ]
treeToPostParts Node { rootLabel = (TagOpen "a" attrs) } =
let m_href = getAttr "href" attrs
in case m_href of
Nothing ->
[ SimpleText "Anchor without href" ]
Just href ->
let target = getAttr "target" attrs
in case target of
Just "_blank" ->
[ PostedUrl href ]
_ ->
[ Quote $ parseURL $ Text.unpack href ]
treeToPostParts Node { rootLabel = (TagOpen "span" attrs), subForest } =
maybe [] (:[]) $ foldr foldfunc Nothing classList
where
classList :: [ Text ]
classList = maybe [] Text.words $ getAttr "class" attrs
foldfunc :: Text -> Maybe PostPart -> Maybe PostPart
foldfunc cls Nothing = (>>= \p -> Just $ p $ forestToPostParts subForest) $ matchPart cls
foldfunc _ x@(Just _) = x
matchPart :: Text -> Maybe ([ PostPart ] -> PostPart)
matchPart "quote" = Just GreenText
matchPart "orangeQuote" = Just OrangeText
matchPart "heading" = Just RedText
matchPart "spoiler" = Just Spoiler
matchPart _ = Nothing
treeToPostParts Node { rootLabel = (TagOpen "em" _), subForest } =
[ Italics $ forestToPostParts subForest ]
treeToPostParts Node { rootLabel = (TagOpen "strong" _), subForest } =
[ Bold $ forestToPostParts subForest ]
treeToPostParts Node { rootLabel = (TagOpen "u" _), subForest } =
[ Underlined $ forestToPostParts subForest ]
treeToPostParts Node { rootLabel = (TagOpen "s" _), subForest } =
[ Strikethrough $ forestToPostParts subForest ]
treeToPostParts Node { rootLabel = (TagOpen "pre" _), subForest } =
[ Code $ forestToPostParts subForest ]
treeToPostParts Node { rootLabel = (TagOpen "br" _) } =
[ Skip ]
treeToPostParts Node { rootLabel = (ContentText txt) } = [ SimpleText txt ]
treeToPostParts _ = [ Skip ]
-- Forest == [ Tree Token ]
--
-- data Tree a = Node {
-- rootLabel :: a, -- ^ label value
-- subForest :: [Tree a] -- ^ zero or more child trees
-- }
--
-- Tree a == Tree Token
--
-- data Tree Token = Node {
-- rootLabel :: Token, -- ^ label value
-- subForest :: [Tree Token] -- ^ zero or more child trees
-- }
--
-- data Token
-- -- | An opening tag. Attribute ordering is arbitrary. Void elements have a 'TagOpen' but no corresponding 'TagClose'. See 'Text.HTML.Tree.nonClosing'.
-- = TagOpen !TagName [Attr]
-- -- | A self-closing tag.
-- | TagSelfClose !TagName [Attr]
-- -- | A closing tag.
-- | TagClose !TagName
-- -- | The content between tags.
-- | ContentText !Text
-- -- | A single character of content
-- | ContentChar !Char
-- -- | Contents of a comment.
-- | Comment !Builder
-- -- | Doctype
-- | Doctype !Text
-- deriving (Show, Ord, Eq, Generic)

View File

@ -0,0 +1,87 @@
{-# LANGUAGE OverloadedStrings #-}
module Parsing.BodyParser where
import Data.Text (Text)
import qualified Data.Text.IO as Text
import Text.HTML.Parser (parseTokens, canonicalizeTokens, Token (..), Attr(..))
import Text.HTML.Tree (tokensToForest)
import Data.Tree (Forest, Tree (..))
getAttr :: Text -> [ Attr ] -> Maybe Text
getAttr _ [] = Nothing
getAttr attrName (Attr x y:xs)
| x == attrName = Just y
| otherwise = getAttr attrName xs
parsePostBody :: Text -> IO ()
parsePostBody html =
case tokensToForest $ canonicalizeTokens $ parseTokens html of
Left err -> do
print err
Right forest -> forestToPostParts forest
forestToPostParts :: Forest Token -> IO ()
forestToPostParts = mapM_ treeToPostParts
treeToPostParts :: Tree Token -> IO ()
treeToPostParts Node { rootLabel = tok@(TagOpen "a" attrs) } =
do
putStrLn "Anchor Tag Open"
print attrs
print $ getAttr "href" attrs
treeToPostParts Node { rootLabel = tok@(TagSelfClose tagname attrs) } =
do
putStrLn "Tag Self-Close"
putStrLn $ "tagname: " ++ show tagname
putStrLn $ "attrs: " ++ show attrs
-- treeToPostParts Node { rootLabel = (ContentText txt) } = Text.putStrLn txt
treeToPostParts node@Node { rootLabel } =
do
putStrLn $ "Something Else: " ++ show rootLabel
-- Forest == [ Tree Token ]
--
-- data Tree a = Node {
-- rootLabel :: a, -- ^ label value
-- subForest :: [Tree a] -- ^ zero or more child trees
-- }
--
-- Tree a == Tree Token
--
-- data Tree Token = Node {
-- rootLabel :: Token, -- ^ label value
-- subForest :: [Tree Token] -- ^ zero or more child trees
-- }
--
-- and what is a Token? We might need to make a toy program with this
--
-- data Token
-- -- | An opening tag. Attribute ordering is arbitrary. Void elements have a 'TagOpen' but no corresponding 'TagClose'. See 'Text.HTML.Tree.nonClosing'.
-- = TagOpen !TagName [Attr]
-- -- | A self-closing tag.
-- | TagSelfClose !TagName [Attr]
-- -- | A closing tag.
-- | TagClose !TagName
-- -- | The content between tags.
-- | ContentText !Text
-- -- | A single character of content
-- | ContentChar !Char
-- -- | Contents of a comment.
-- | Comment !Builder
-- -- | Doctype
-- | Doctype !Text
-- deriving (Show, Ord, Eq, Generic)
-- Add your main method here
main :: IO ()
main = do
-- Read the contents of the file
fileContents <- Text.readFile "/home/phil/Documents/haskell/chandlr-miso/html/body.html"
parsePostBody fileContents

View File

@ -122,7 +122,8 @@ executable chandlr-server
http-types, http-types,
cmdargs, cmdargs,
mtl, mtl,
utf8-string utf8-string,
html-parse
-- Directories containing source files. -- Directories containing source files.
hs-source-dirs: app hs-source-dirs: app

View File

@ -2,9 +2,11 @@
let let
http-conduit = import ./app/Common/nix-support/http-conduit.nix { inherit nixpkgs; }; http-conduit = import ./app/Common/nix-support/http-conduit.nix { inherit nixpkgs; };
html-parse = import ./nix-support/html-parse.nix { inherit nixpkgs; };
drv = nixpkgs.haskellPackages.callCabal2nix "chandlr-server" ./. { drv = nixpkgs.haskellPackages.callCabal2nix "chandlr-server" ./. {
http-conduit = http-conduit.http-conduit; http-conduit = http-conduit.http-conduit;
html-parse = html-parse;
}; };
env = drv.env.overrideAttrs (oldAttrs: { env = drv.env.overrideAttrs (oldAttrs: {

View File

@ -0,0 +1,16 @@
{ nixpkgs ? import <nixpkgs> {} }:
let
haskellPackages = nixpkgs.haskellPackages;
src = nixpkgs.fetchFromGitHub {
owner = "bgamari";
repo = "html-parse";
rev = "fcbdfe4ae4da14e9af0e21fabfe2da178f041970";
sha256 = "sha256-naCRlbDGtcCBDiHOPsZwQ3dzVbFtkC1dr5r6ejdewiI=";
};
html-parse = haskellPackages.callCabal2nix "html-parse" src {};
in
html-parse