Compare commits

..

1 Commits

Author SHA1 Message Date
towards-a-new-leftypol b0bcc8f835 Attempt to use tagsoup but it doesn't let you traverse the doc
- It's a primitive lexical parser that gives a list of tag
  subcomponents, we'd need a parser on top of it to parse the body.
  But there's probably a lib that let's us work with html already.
2024-03-27 22:56:23 -04:00
6 changed files with 24 additions and 220 deletions

@ -1 +1 @@
Subproject commit 583c6b637a6b3938ec81a3265ead91d8f6012847
Subproject commit 6b210bf185141b4716e06937c1b93a233bac1fcd

View File

@ -7,125 +7,34 @@ module Parsing.BodyParser
, Backlinks
) where
import Data.Text (Text)
import qualified Data.Text as Text
import Text.HTML.Parser
( parseTokens
, canonicalizeTokens
, Token (..)
, Attr(..)
)
import Text.HTML.Tree (tokensToForest)
import Data.Tree (Forest, Tree (..))
import Data.Text (Text, unpack)
import Common.Parsing.PostPartType
import Common.Parsing.QuoteLinkParser
import Common.Parsing.PostBodyUtils
import Common.Parsing.QuoteLinkParser
tagsToPostParts :: [ Tag Text ] -> [ PostPart ]
tagsToPostParts = concatMap tagToPostPart
getAttr :: Text -> [ Attr ] -> Maybe Text
getAttr _ [] = Nothing
getAttr attrName (Attr x y:xs)
| x == attrName = Just y
| otherwise = getAttr attrName xs
tagToPostPart :: Tag Text -> [ PostPart ]
tagToPostPart (TagText t) = SimpleText t : []
tagToPostPart (TagOpen "a" attrs) = parseAnchor attrs : []
tagToPostPart (TagOpen "span" attrs) = parseSpan attrs : []
parseAnchor :: [ Attribute Text ] -> PostPart
parseAnchor attrs =
case lookup "href" attrs of
Nothing -> SimpleText "Anchor without href"
Just href ->
case lookup "target" attrs of
Just "_blank" -> PostedUrl href
_ -> Quote $ parseURL $ unpack href
parseSpan :: [ Attribute Text ] -> PostPart
parsePostBody :: Text -> IO [ PostPart ]
parsePostBody html =
case tokensToForest $ canonicalizeTokens $ parseTokens html of
Left err -> do
print err
return []
Right forest -> return $ forestToPostParts forest
forestToPostParts :: Forest Token -> [ PostPart ]
forestToPostParts = concatMap treeToPostParts
treeToPostParts :: Tree Token -> [ PostPart ]
treeToPostParts Node { rootLabel = (TagOpen "a" attrs) } =
let m_href = getAttr "href" attrs
in case m_href of
Nothing ->
[ SimpleText "Anchor without href" ]
Just href ->
let target = getAttr "target" attrs
in case target of
Just "_blank" ->
[ PostedUrl href ]
_ ->
[ Quote $ parseURL $ Text.unpack href ]
treeToPostParts Node { rootLabel = (TagOpen "span" attrs), subForest } =
maybe [] (:[]) $ foldr foldfunc Nothing classList
where
classList :: [ Text ]
classList = maybe [] Text.words $ getAttr "class" attrs
foldfunc :: Text -> Maybe PostPart -> Maybe PostPart
foldfunc cls Nothing = (>>= \p -> Just $ p $ forestToPostParts subForest) $ matchPart cls
foldfunc _ x@(Just _) = x
matchPart :: Text -> Maybe ([ PostPart ] -> PostPart)
matchPart "quote" = Just GreenText
matchPart "orangeQuote" = Just OrangeText
matchPart "heading" = Just RedText
matchPart "spoiler" = Just Spoiler
matchPart _ = Nothing
treeToPostParts Node { rootLabel = (TagOpen "em" _), subForest } =
[ Italics $ forestToPostParts subForest ]
treeToPostParts Node { rootLabel = (TagOpen "strong" _), subForest } =
[ Bold $ forestToPostParts subForest ]
treeToPostParts Node { rootLabel = (TagOpen "u" _), subForest } =
[ Underlined $ forestToPostParts subForest ]
treeToPostParts Node { rootLabel = (TagOpen "s" _), subForest } =
[ Strikethrough $ forestToPostParts subForest ]
treeToPostParts Node { rootLabel = (TagOpen "pre" _), subForest } =
[ Code $ forestToPostParts subForest ]
treeToPostParts Node { rootLabel = (TagOpen "br" _) } =
[ Skip ]
treeToPostParts Node { rootLabel = (ContentText txt) } = [ SimpleText txt ]
treeToPostParts _ = [ Skip ]
-- Forest == [ Tree Token ]
--
-- data Tree a = Node {
-- rootLabel :: a, -- ^ label value
-- subForest :: [Tree a] -- ^ zero or more child trees
-- }
--
-- Tree a == Tree Token
--
-- data Tree Token = Node {
-- rootLabel :: Token, -- ^ label value
-- subForest :: [Tree Token] -- ^ zero or more child trees
-- }
--
-- data Token
-- -- | An opening tag. Attribute ordering is arbitrary. Void elements have a 'TagOpen' but no corresponding 'TagClose'. See 'Text.HTML.Tree.nonClosing'.
-- = TagOpen !TagName [Attr]
-- -- | A self-closing tag.
-- | TagSelfClose !TagName [Attr]
-- -- | A closing tag.
-- | TagClose !TagName
-- -- | The content between tags.
-- | ContentText !Text
-- -- | A single character of content
-- | ContentChar !Char
-- -- | Contents of a comment.
-- | Comment !Builder
-- -- | Doctype
-- | Doctype !Text
-- deriving (Show, Ord, Eq, Generic)
return $ tagsToPostParts $ parseTags html

View File

@ -1,87 +0,0 @@
{-# LANGUAGE OverloadedStrings #-}
module Parsing.BodyParser where
import Data.Text (Text)
import qualified Data.Text.IO as Text
import Text.HTML.Parser (parseTokens, canonicalizeTokens, Token (..), Attr(..))
import Text.HTML.Tree (tokensToForest)
import Data.Tree (Forest, Tree (..))
getAttr :: Text -> [ Attr ] -> Maybe Text
getAttr _ [] = Nothing
getAttr attrName (Attr x y:xs)
| x == attrName = Just y
| otherwise = getAttr attrName xs
parsePostBody :: Text -> IO ()
parsePostBody html =
case tokensToForest $ canonicalizeTokens $ parseTokens html of
Left err -> do
print err
Right forest -> forestToPostParts forest
forestToPostParts :: Forest Token -> IO ()
forestToPostParts = mapM_ treeToPostParts
treeToPostParts :: Tree Token -> IO ()
treeToPostParts Node { rootLabel = tok@(TagOpen "a" attrs) } =
do
putStrLn "Anchor Tag Open"
print attrs
print $ getAttr "href" attrs
treeToPostParts Node { rootLabel = tok@(TagSelfClose tagname attrs) } =
do
putStrLn "Tag Self-Close"
putStrLn $ "tagname: " ++ show tagname
putStrLn $ "attrs: " ++ show attrs
-- treeToPostParts Node { rootLabel = (ContentText txt) } = Text.putStrLn txt
treeToPostParts node@Node { rootLabel } =
do
putStrLn $ "Something Else: " ++ show rootLabel
-- Forest == [ Tree Token ]
--
-- data Tree a = Node {
-- rootLabel :: a, -- ^ label value
-- subForest :: [Tree a] -- ^ zero or more child trees
-- }
--
-- Tree a == Tree Token
--
-- data Tree Token = Node {
-- rootLabel :: Token, -- ^ label value
-- subForest :: [Tree Token] -- ^ zero or more child trees
-- }
--
-- and what is a Token? We might need to make a toy program with this
--
-- data Token
-- -- | An opening tag. Attribute ordering is arbitrary. Void elements have a 'TagOpen' but no corresponding 'TagClose'. See 'Text.HTML.Tree.nonClosing'.
-- = TagOpen !TagName [Attr]
-- -- | A self-closing tag.
-- | TagSelfClose !TagName [Attr]
-- -- | A closing tag.
-- | TagClose !TagName
-- -- | The content between tags.
-- | ContentText !Text
-- -- | A single character of content
-- | ContentChar !Char
-- -- | Contents of a comment.
-- | Comment !Builder
-- -- | Doctype
-- | Doctype !Text
-- deriving (Show, Ord, Eq, Generic)
-- Add your main method here
main :: IO ()
main = do
-- Read the contents of the file
fileContents <- Text.readFile "/home/phil/Documents/haskell/chandlr-miso/html/body.html"
parsePostBody fileContents

View File

@ -123,7 +123,7 @@ executable chandlr-server
cmdargs,
mtl,
utf8-string,
html-parse
tagsoup
-- Directories containing source files.
hs-source-dirs: app

View File

@ -2,11 +2,9 @@
let
http-conduit = import ./app/Common/nix-support/http-conduit.nix { inherit nixpkgs; };
html-parse = import ./nix-support/html-parse.nix { inherit nixpkgs; };
drv = nixpkgs.haskellPackages.callCabal2nix "chandlr-server" ./. {
http-conduit = http-conduit.http-conduit;
html-parse = html-parse;
};
env = drv.env.overrideAttrs (oldAttrs: {

View File

@ -1,16 +0,0 @@
{ nixpkgs ? import <nixpkgs> {} }:
let
haskellPackages = nixpkgs.haskellPackages;
src = nixpkgs.fetchFromGitHub {
owner = "bgamari";
repo = "html-parse";
rev = "fcbdfe4ae4da14e9af0e21fabfe2da178f041970";
sha256 = "sha256-naCRlbDGtcCBDiHOPsZwQ3dzVbFtkC1dr5r6ejdewiI=";
};
html-parse = haskellPackages.callCabal2nix "html-parse" src {};
in
html-parse