Improve parsing

This commit is contained in:
artlef 2018-06-02 18:23:05 +02:00
parent 3a38612d1d
commit 01cc700706

View File

@ -1,4 +1,4 @@
module ParseHtml (printDef, printDifferentDef) where
module ParseHtml (printDef, printDifferentDef, filterSupTags, findRelevantTags) where
import Text.HTML.TagSoup
import Prelude
@ -54,10 +54,18 @@ parseFullDefTags :: T.Text -> [Tag String]
parseFullDefTags = findRelevantTags . parseTags . T.unpack
findRelevantTags :: [Tag String] -> [Tag String]
findRelevantTags [] = []
findRelevantTags (x:xs) = if x ~== "<span class=tlf_cdefinition"
|| x ~== "<span class=tlf_cexemple"
then maybe (f xs) (\y -> y:(f xs)) (L.find (~== TagText "") xs)
else (f xs)
findRelevantTags tags =
filter (~== TagText "") filteredTags
where
filteredTags =
takeWhile (~/= "<div id=footer>") $
dropWhile (~/= "<span class=tlf_cdefinition>") $
filterSupTags tags
filterSupTags :: [Tag String] -> [Tag String]
filterSupTags tags = filter (/= TagText "willBeDeleted") $ snd $ L.mapAccumL transformSupText (TagText "") tags
where
f = findRelevantTags
transformSupText x y
| y ~== "<sup>" = (TagText "willBeDeleted", TagText "willBeDeleted")
| x == TagText "willBeDeleted" = (TagText "", TagText "willBeDeleted")
| otherwise = (x,y)