Attempt to use tagsoup but it doesn't let you traverse the doc

- It's a primitive lexical parser that gives a list of tag
  subcomponents, we'd need a parser on top of it to parse the body.
  But there's probably a lib that let's us work with html already.
This commit is contained in:
towards-a-new-leftypol 2024-03-27 22:56:23 -04:00
parent 644011dd23
commit b0bcc8f835
3 changed files with 38 additions and 7 deletions

View File

@ -49,6 +49,7 @@ import qualified Common.Component.CatalogGrid as Grid
import qualified Common.Component.TimeControl as TC import qualified Common.Component.TimeControl as TC
import Common.FrontEnd.Action (GetThreadArgs (..)) import Common.FrontEnd.Action (GetThreadArgs (..))
import qualified Common.Component.Thread.Model as Thread import qualified Common.Component.Thread.Model as Thread
import qualified Common.Component.ThreadView as Thread
import Common.Network.SiteType (Site) import Common.Network.SiteType (Site)
data HtmlPage a = forall b. (ToJSON b) => HtmlPage (JSONSettings, b, a) data HtmlPage a = forall b. (ToJSON b) => HtmlPage (JSONSettings, b, a)
@ -174,11 +175,14 @@ threadView settings website board_pathpart board_thread_id = do
case thread_results of case thread_results of
Left err -> throwError $ err500 { errBody = fromString $ show err } Left err -> throwError $ err500 { errBody = fromString $ show err }
Right site -> pure $ render now $ head site Right site -> do
let s = head site
posts_and_bodies <- liftIO $ Thread.getPostWithBodies s
pure $ render posts_and_bodies now s
where where
render :: UTCTime -> Site -> HtmlPage (View FE.Action) render :: [ Thread.PostWithBody ] -> UTCTime -> Site -> HtmlPage (View FE.Action)
render t site = render posts_and_bodies t site =
HtmlPage HtmlPage
( settings ( settings
, site , site
@ -200,7 +204,7 @@ threadView settings website board_pathpart board_thread_id = do
thread_model = Thread.Model thread_model = Thread.Model
{ Thread.site = site { Thread.site = site
, Thread.media_root = pack $ media_root settings , Thread.media_root = pack $ media_root settings
, Thread.post_bodies = [] , Thread.post_bodies = posts_and_bodies
, Thread.current_time = t , Thread.current_time = t
} }

View File

@ -1,3 +1,5 @@
{-# LANGUAGE OverloadedStrings #-}
module Parsing.BodyParser module Parsing.BodyParser
( PostPart (..) ( PostPart (..)
, parsePostBody , parsePostBody
@ -5,10 +7,34 @@ module Parsing.BodyParser
, Backlinks , Backlinks
) where ) where
import Data.Text (Text) import Data.Text (Text, unpack)
import Common.Parsing.PostPartType import Common.Parsing.PostPartType
import Common.Parsing.PostBodyUtils import Common.Parsing.PostBodyUtils
import Common.Parsing.QuoteLinkParser
tagsToPostParts :: [ Tag Text ] -> [ PostPart ]
tagsToPostParts = concatMap tagToPostPart
tagToPostPart :: Tag Text -> [ PostPart ]
tagToPostPart (TagText t) = SimpleText t : []
tagToPostPart (TagOpen "a" attrs) = parseAnchor attrs : []
tagToPostPart (TagOpen "span" attrs) = parseSpan attrs : []
parseAnchor :: [ Attribute Text ] -> PostPart
parseAnchor attrs =
case lookup "href" attrs of
Nothing -> SimpleText "Anchor without href"
Just href ->
case lookup "target" attrs of
Just "_blank" -> PostedUrl href
_ -> Quote $ parseURL $ unpack href
parseSpan :: [ Attribute Text ] -> PostPart
parsePostBody :: Text -> IO [ PostPart ] parsePostBody :: Text -> IO [ PostPart ]
parsePostBody = undefined parsePostBody html =
return $ tagsToPostParts $ parseTags html

View File

@ -122,7 +122,8 @@ executable chandlr-server
http-types, http-types,
cmdargs, cmdargs,
mtl, mtl,
utf8-string utf8-string,
tagsoup
-- Directories containing source files. -- Directories containing source files.
hs-source-dirs: app hs-source-dirs: app