Parse post body using DOM

- add module for parsing post body
- creates and element and sets the innerhtml of it to a string
- then iterates over the nodes that were created to
  get them into a data structure
- re-use data-structure from bunkerchan-upload script
    - parsing logic is similar but uses ghcjs-dom API instead of HXT
This commit is contained in:
towards-a-new-leftypol 2024-02-09 05:38:06 -05:00
parent 388c9a5b28
commit 3b1d130f41
2 changed files with 152 additions and 0 deletions

View File

@ -77,6 +77,7 @@ executable chandlr
Component.Thread.Intro
Routes
Common.AttachmentType
BodyParser
-- LANGUAGE extensions used by modules in this package.

151
src/BodyParser.hs Normal file
View File

@ -0,0 +1,151 @@
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE MultiWayIf #-}
module BodyParser
( PostPart (..)
, parsePostBody
) where
import Data.Maybe (catMaybes)
import GHCJS.DOM (currentDocument)
import GHCJS.DOM.Types
( Element (..)
, JSString
, NodeList
, uncheckedCastTo
)
import GHCJS.DOM.JSFFI.Generated.Document
import GHCJS.DOM.JSFFI.Generated.Element
import GHCJS.DOM.JSFFI.Generated.Node hiding (contains)
import qualified GHCJS.DOM.JSFFI.Generated.NodeList as NodeList
import GHCJS.DOM.JSFFI.Generated.DOMTokenList (contains)
import Data.Text (Text)
data PostPart
= SimpleText JSString
| PostedUrl JSString
| Skip
| Quote JSString
-- Quotes don't seem to be able to be spoilered
-- board links (which appear as quotes but start with >>>) break the tag
| GreenText [ PostPart ]
| OrangeText [ PostPart ]
| RedText [ PostPart ]
| Spoiler [ PostPart ]
-- you can't seem to spoiler greentext
| Bold [ PostPart ]
| Underlined [ PostPart ]
| Italics [ PostPart ]
| Strikethrough [ PostPart ]
deriving Show
nodeListToList :: NodeList -> IO [ Node ]
nodeListToList l = do
len <- NodeList.getLength l
nodes <- mapM (NodeList.item l) [0..len-1]
return $ catMaybes nodes
-- | Parse the HTML string and add event handlers to certain elements
parsePostBody :: Text -> IO [ PostPart ]
parsePostBody htmlString = do
Just doc <- currentDocument
container <- createElement doc ("div" :: Text)
-- Set the innerHTML of the container to the HTML string
setInnerHTML container htmlString
-- Iterate over the newly created elements in the container
children <- getChildNodes container
parseNodeList children
toPostPart :: Node -> IO PostPart
toPostPart node = do
node_type <- getNodeType node
toPostPart_ node_type node
toPostPart_ :: Word -> Node -> IO PostPart
toPostPart_ node_type node
| node_type == TEXT_NODE =
getTextContentUnchecked node >>= return . SimpleText
| node_type == ELEMENT_NODE = do
tagName :: JSString <- getTagName element
case tagName of
"a" -> parseAnchor element
"span" -> parseSpan element
"em" -> parseEm element
"strong" -> parseStrong element
"u" -> parseU element
"s" -> parseS element
_ -> return $ SimpleText "Unsupported element"
| otherwise = return Skip
where
element = uncheckedCastTo Element node
parseAnchor :: Element -> IO PostPart
parseAnchor element = do
m_href :: Maybe JSString <- getAttribute element ("href" :: JSString)
case m_href of
Nothing -> return $ SimpleText "Anchor without href"
Just href -> do
target <- getAttribute element ("target" :: JSString)
case target of
Just ("_blank" :: JSString) -> return $ PostedUrl href
_ -> return $ Quote href
parseSpan :: Element -> IO PostPart
parseSpan element = do
classList <- getClassList element
quote <- contains classList ("quote" :: JSString)
orangeQuote <- contains classList ("orangeQuote" :: JSString)
heading <- contains classList ("heading" :: JSString)
spoiler <- contains classList ("spoiler" :: JSString)
if | quote -> parseChildNodes element >>= return . GreenText
| orangeQuote -> parseChildNodes element >>= return . OrangeText
| heading -> parseChildNodes element >>= return . RedText
| spoiler -> parseChildNodes element >>= return . Spoiler
| otherwise -> return $ SimpleText "Unsupported span class"
parseNodeList :: NodeList -> IO [PostPart]
parseNodeList nodes = nodeListToList nodes >>= mapM toPostPart
parseChildNodes :: Element -> IO [PostPart]
parseChildNodes element = getChildNodes element >>= parseNodeList
parseEm :: Element -> IO PostPart
parseEm element
= parseChildNodes element
>>= return . Italics
parseStrong :: Element -> IO PostPart
parseStrong element
= parseChildNodes element
>>= return . Bold
parseU :: Element -> IO PostPart
parseU element
= parseChildNodes element
>>= return . Underlined
parseS :: Element -> IO PostPart
parseS element
= parseChildNodes element
>>= return . Strikethrough