Implement BodyParser using html-parse
- based on implementation from front-end, but that one uses the DOM api
This commit is contained in:
parent
644011dd23
commit
4443dceb4d
|
@ -1 +1 @@
|
||||||
Subproject commit 6b210bf185141b4716e06937c1b93a233bac1fcd
|
Subproject commit 583c6b637a6b3938ec81a3265ead91d8f6012847
|
12
app/Main.hs
12
app/Main.hs
|
@ -49,6 +49,7 @@ import qualified Common.Component.CatalogGrid as Grid
|
||||||
import qualified Common.Component.TimeControl as TC
|
import qualified Common.Component.TimeControl as TC
|
||||||
import Common.FrontEnd.Action (GetThreadArgs (..))
|
import Common.FrontEnd.Action (GetThreadArgs (..))
|
||||||
import qualified Common.Component.Thread.Model as Thread
|
import qualified Common.Component.Thread.Model as Thread
|
||||||
|
import qualified Common.Component.ThreadView as Thread
|
||||||
import Common.Network.SiteType (Site)
|
import Common.Network.SiteType (Site)
|
||||||
|
|
||||||
data HtmlPage a = forall b. (ToJSON b) => HtmlPage (JSONSettings, b, a)
|
data HtmlPage a = forall b. (ToJSON b) => HtmlPage (JSONSettings, b, a)
|
||||||
|
@ -174,11 +175,14 @@ threadView settings website board_pathpart board_thread_id = do
|
||||||
|
|
||||||
case thread_results of
|
case thread_results of
|
||||||
Left err -> throwError $ err500 { errBody = fromString $ show err }
|
Left err -> throwError $ err500 { errBody = fromString $ show err }
|
||||||
Right site -> pure $ render now $ head site
|
Right site -> do
|
||||||
|
let s = head site
|
||||||
|
posts_and_bodies <- liftIO $ Thread.getPostWithBodies s
|
||||||
|
pure $ render posts_and_bodies now s
|
||||||
|
|
||||||
where
|
where
|
||||||
render :: UTCTime -> Site -> HtmlPage (View FE.Action)
|
render :: [ Thread.PostWithBody ] -> UTCTime -> Site -> HtmlPage (View FE.Action)
|
||||||
render t site =
|
render posts_and_bodies t site =
|
||||||
HtmlPage
|
HtmlPage
|
||||||
( settings
|
( settings
|
||||||
, site
|
, site
|
||||||
|
@ -200,7 +204,7 @@ threadView settings website board_pathpart board_thread_id = do
|
||||||
thread_model = Thread.Model
|
thread_model = Thread.Model
|
||||||
{ Thread.site = site
|
{ Thread.site = site
|
||||||
, Thread.media_root = pack $ media_root settings
|
, Thread.media_root = pack $ media_root settings
|
||||||
, Thread.post_bodies = []
|
, Thread.post_bodies = posts_and_bodies
|
||||||
, Thread.current_time = t
|
, Thread.current_time = t
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
|
||||||
module Parsing.BodyParser
|
module Parsing.BodyParser
|
||||||
( PostPart (..)
|
( PostPart (..)
|
||||||
, parsePostBody
|
, parsePostBody
|
||||||
|
@ -6,9 +8,124 @@ module Parsing.BodyParser
|
||||||
) where
|
) where
|
||||||
|
|
||||||
import Data.Text (Text)
|
import Data.Text (Text)
|
||||||
|
import qualified Data.Text as Text
|
||||||
|
import Text.HTML.Parser
|
||||||
|
( parseTokens
|
||||||
|
, canonicalizeTokens
|
||||||
|
, Token (..)
|
||||||
|
, Attr(..)
|
||||||
|
)
|
||||||
|
import Text.HTML.Tree (tokensToForest)
|
||||||
|
import Data.Tree (Forest, Tree (..))
|
||||||
|
|
||||||
import Common.Parsing.PostPartType
|
import Common.Parsing.PostPartType
|
||||||
|
import Common.Parsing.QuoteLinkParser
|
||||||
import Common.Parsing.PostBodyUtils
|
import Common.Parsing.PostBodyUtils
|
||||||
|
|
||||||
|
|
||||||
|
getAttr :: Text -> [ Attr ] -> Maybe Text
|
||||||
|
getAttr _ [] = Nothing
|
||||||
|
getAttr attrName (Attr x y:xs)
|
||||||
|
| x == attrName = Just y
|
||||||
|
| otherwise = getAttr attrName xs
|
||||||
|
|
||||||
|
|
||||||
parsePostBody :: Text -> IO [ PostPart ]
|
parsePostBody :: Text -> IO [ PostPart ]
|
||||||
parsePostBody = undefined
|
parsePostBody html =
|
||||||
|
case tokensToForest $ canonicalizeTokens $ parseTokens html of
|
||||||
|
Left err -> do
|
||||||
|
print err
|
||||||
|
return []
|
||||||
|
|
||||||
|
Right forest -> return $ forestToPostParts forest
|
||||||
|
|
||||||
|
|
||||||
|
forestToPostParts :: Forest Token -> [ PostPart ]
|
||||||
|
forestToPostParts = concatMap treeToPostParts
|
||||||
|
|
||||||
|
|
||||||
|
treeToPostParts :: Tree Token -> [ PostPart ]
|
||||||
|
treeToPostParts Node { rootLabel = (TagOpen "a" attrs) } =
|
||||||
|
let m_href = getAttr "href" attrs
|
||||||
|
in case m_href of
|
||||||
|
Nothing ->
|
||||||
|
[ SimpleText "Anchor without href" ]
|
||||||
|
Just href ->
|
||||||
|
let target = getAttr "target" attrs
|
||||||
|
in case target of
|
||||||
|
Just "_blank" ->
|
||||||
|
[ PostedUrl href ]
|
||||||
|
_ ->
|
||||||
|
[ Quote $ parseURL $ Text.unpack href ]
|
||||||
|
|
||||||
|
|
||||||
|
treeToPostParts Node { rootLabel = (TagOpen "span" attrs), subForest } =
|
||||||
|
maybe [] (:[]) $ foldr foldfunc Nothing classList
|
||||||
|
|
||||||
|
where
|
||||||
|
classList :: [ Text ]
|
||||||
|
classList = maybe [] Text.words $ getAttr "class" attrs
|
||||||
|
|
||||||
|
foldfunc :: Text -> Maybe PostPart -> Maybe PostPart
|
||||||
|
foldfunc cls Nothing = (>>= \p -> Just $ p $ forestToPostParts subForest) $ matchPart cls
|
||||||
|
foldfunc _ x@(Just _) = x
|
||||||
|
|
||||||
|
matchPart :: Text -> Maybe ([ PostPart ] -> PostPart)
|
||||||
|
matchPart "quote" = Just GreenText
|
||||||
|
matchPart "orangeQuote" = Just OrangeText
|
||||||
|
matchPart "heading" = Just RedText
|
||||||
|
matchPart "spoiler" = Just Spoiler
|
||||||
|
matchPart _ = Nothing
|
||||||
|
|
||||||
|
treeToPostParts Node { rootLabel = (TagOpen "em" _), subForest } =
|
||||||
|
[ Italics $ forestToPostParts subForest ]
|
||||||
|
|
||||||
|
treeToPostParts Node { rootLabel = (TagOpen "strong" _), subForest } =
|
||||||
|
[ Bold $ forestToPostParts subForest ]
|
||||||
|
|
||||||
|
treeToPostParts Node { rootLabel = (TagOpen "u" _), subForest } =
|
||||||
|
[ Underlined $ forestToPostParts subForest ]
|
||||||
|
|
||||||
|
treeToPostParts Node { rootLabel = (TagOpen "s" _), subForest } =
|
||||||
|
[ Strikethrough $ forestToPostParts subForest ]
|
||||||
|
|
||||||
|
treeToPostParts Node { rootLabel = (TagOpen "pre" _), subForest } =
|
||||||
|
[ Code $ forestToPostParts subForest ]
|
||||||
|
|
||||||
|
treeToPostParts Node { rootLabel = (TagOpen "br" _) } =
|
||||||
|
[ Skip ]
|
||||||
|
|
||||||
|
treeToPostParts Node { rootLabel = (ContentText txt) } = [ SimpleText txt ]
|
||||||
|
|
||||||
|
treeToPostParts _ = [ Skip ]
|
||||||
|
|
||||||
|
-- Forest == [ Tree Token ]
|
||||||
|
--
|
||||||
|
-- data Tree a = Node {
|
||||||
|
-- rootLabel :: a, -- ^ label value
|
||||||
|
-- subForest :: [Tree a] -- ^ zero or more child trees
|
||||||
|
-- }
|
||||||
|
--
|
||||||
|
-- Tree a == Tree Token
|
||||||
|
--
|
||||||
|
-- data Tree Token = Node {
|
||||||
|
-- rootLabel :: Token, -- ^ label value
|
||||||
|
-- subForest :: [Tree Token] -- ^ zero or more child trees
|
||||||
|
-- }
|
||||||
|
--
|
||||||
|
-- data Token
|
||||||
|
-- -- | An opening tag. Attribute ordering is arbitrary. Void elements have a 'TagOpen' but no corresponding 'TagClose'. See 'Text.HTML.Tree.nonClosing'.
|
||||||
|
-- = TagOpen !TagName [Attr]
|
||||||
|
-- -- | A self-closing tag.
|
||||||
|
-- | TagSelfClose !TagName [Attr]
|
||||||
|
-- -- | A closing tag.
|
||||||
|
-- | TagClose !TagName
|
||||||
|
-- -- | The content between tags.
|
||||||
|
-- | ContentText !Text
|
||||||
|
-- -- | A single character of content
|
||||||
|
-- | ContentChar !Char
|
||||||
|
-- -- | Contents of a comment.
|
||||||
|
-- | Comment !Builder
|
||||||
|
-- -- | Doctype
|
||||||
|
-- | Doctype !Text
|
||||||
|
-- deriving (Show, Ord, Eq, Generic)
|
||||||
|
|
|
@ -0,0 +1,87 @@
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
|
||||||
|
module Parsing.BodyParser where
|
||||||
|
|
||||||
|
import Data.Text (Text)
|
||||||
|
import qualified Data.Text.IO as Text
|
||||||
|
import Text.HTML.Parser (parseTokens, canonicalizeTokens, Token (..), Attr(..))
|
||||||
|
import Text.HTML.Tree (tokensToForest)
|
||||||
|
import Data.Tree (Forest, Tree (..))
|
||||||
|
|
||||||
|
getAttr :: Text -> [ Attr ] -> Maybe Text
|
||||||
|
getAttr _ [] = Nothing
|
||||||
|
getAttr attrName (Attr x y:xs)
|
||||||
|
| x == attrName = Just y
|
||||||
|
| otherwise = getAttr attrName xs
|
||||||
|
|
||||||
|
parsePostBody :: Text -> IO ()
|
||||||
|
parsePostBody html =
|
||||||
|
case tokensToForest $ canonicalizeTokens $ parseTokens html of
|
||||||
|
Left err -> do
|
||||||
|
print err
|
||||||
|
|
||||||
|
Right forest -> forestToPostParts forest
|
||||||
|
|
||||||
|
|
||||||
|
forestToPostParts :: Forest Token -> IO ()
|
||||||
|
forestToPostParts = mapM_ treeToPostParts
|
||||||
|
|
||||||
|
treeToPostParts :: Tree Token -> IO ()
|
||||||
|
treeToPostParts Node { rootLabel = tok@(TagOpen "a" attrs) } =
|
||||||
|
do
|
||||||
|
putStrLn "Anchor Tag Open"
|
||||||
|
print attrs
|
||||||
|
print $ getAttr "href" attrs
|
||||||
|
|
||||||
|
treeToPostParts Node { rootLabel = tok@(TagSelfClose tagname attrs) } =
|
||||||
|
do
|
||||||
|
putStrLn "Tag Self-Close"
|
||||||
|
putStrLn $ "tagname: " ++ show tagname
|
||||||
|
putStrLn $ "attrs: " ++ show attrs
|
||||||
|
|
||||||
|
-- treeToPostParts Node { rootLabel = (ContentText txt) } = Text.putStrLn txt
|
||||||
|
|
||||||
|
treeToPostParts node@Node { rootLabel } =
|
||||||
|
do
|
||||||
|
putStrLn $ "Something Else: " ++ show rootLabel
|
||||||
|
|
||||||
|
-- Forest == [ Tree Token ]
|
||||||
|
--
|
||||||
|
-- data Tree a = Node {
|
||||||
|
-- rootLabel :: a, -- ^ label value
|
||||||
|
-- subForest :: [Tree a] -- ^ zero or more child trees
|
||||||
|
-- }
|
||||||
|
--
|
||||||
|
-- Tree a == Tree Token
|
||||||
|
--
|
||||||
|
-- data Tree Token = Node {
|
||||||
|
-- rootLabel :: Token, -- ^ label value
|
||||||
|
-- subForest :: [Tree Token] -- ^ zero or more child trees
|
||||||
|
-- }
|
||||||
|
--
|
||||||
|
-- and what is a Token? We might need to make a toy program with this
|
||||||
|
--
|
||||||
|
-- data Token
|
||||||
|
-- -- | An opening tag. Attribute ordering is arbitrary. Void elements have a 'TagOpen' but no corresponding 'TagClose'. See 'Text.HTML.Tree.nonClosing'.
|
||||||
|
-- = TagOpen !TagName [Attr]
|
||||||
|
-- -- | A self-closing tag.
|
||||||
|
-- | TagSelfClose !TagName [Attr]
|
||||||
|
-- -- | A closing tag.
|
||||||
|
-- | TagClose !TagName
|
||||||
|
-- -- | The content between tags.
|
||||||
|
-- | ContentText !Text
|
||||||
|
-- -- | A single character of content
|
||||||
|
-- | ContentChar !Char
|
||||||
|
-- -- | Contents of a comment.
|
||||||
|
-- | Comment !Builder
|
||||||
|
-- -- | Doctype
|
||||||
|
-- | Doctype !Text
|
||||||
|
-- deriving (Show, Ord, Eq, Generic)
|
||||||
|
|
||||||
|
-- Add your main method here
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
-- Read the contents of the file
|
||||||
|
fileContents <- Text.readFile "/home/phil/Documents/haskell/chandlr-miso/html/body.html"
|
||||||
|
|
||||||
|
parsePostBody fileContents
|
|
@ -122,7 +122,8 @@ executable chandlr-server
|
||||||
http-types,
|
http-types,
|
||||||
cmdargs,
|
cmdargs,
|
||||||
mtl,
|
mtl,
|
||||||
utf8-string
|
utf8-string,
|
||||||
|
html-parse
|
||||||
|
|
||||||
-- Directories containing source files.
|
-- Directories containing source files.
|
||||||
hs-source-dirs: app
|
hs-source-dirs: app
|
||||||
|
|
|
@ -2,9 +2,11 @@
|
||||||
|
|
||||||
let
|
let
|
||||||
http-conduit = import ./app/Common/nix-support/http-conduit.nix { inherit nixpkgs; };
|
http-conduit = import ./app/Common/nix-support/http-conduit.nix { inherit nixpkgs; };
|
||||||
|
html-parse = import ./nix-support/html-parse.nix { inherit nixpkgs; };
|
||||||
|
|
||||||
drv = nixpkgs.haskellPackages.callCabal2nix "chandlr-server" ./. {
|
drv = nixpkgs.haskellPackages.callCabal2nix "chandlr-server" ./. {
|
||||||
http-conduit = http-conduit.http-conduit;
|
http-conduit = http-conduit.http-conduit;
|
||||||
|
html-parse = html-parse;
|
||||||
};
|
};
|
||||||
|
|
||||||
env = drv.env.overrideAttrs (oldAttrs: {
|
env = drv.env.overrideAttrs (oldAttrs: {
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
{ nixpkgs ? import <nixpkgs> {} }:
|
||||||
|
|
||||||
|
let
|
||||||
|
haskellPackages = nixpkgs.haskellPackages;
|
||||||
|
|
||||||
|
src = nixpkgs.fetchFromGitHub {
|
||||||
|
owner = "bgamari";
|
||||||
|
repo = "html-parse";
|
||||||
|
rev = "fcbdfe4ae4da14e9af0e21fabfe2da178f041970";
|
||||||
|
sha256 = "sha256-naCRlbDGtcCBDiHOPsZwQ3dzVbFtkC1dr5r6ejdewiI=";
|
||||||
|
};
|
||||||
|
|
||||||
|
html-parse = haskellPackages.callCabal2nix "html-parse" src {};
|
||||||
|
in
|
||||||
|
|
||||||
|
html-parse
|
Loading…
Reference in New Issue