dictfr/ParseHtml.hs

64 lines
2.1 KiB
Haskell

module ParseHtml (printDef, printDifferentDef) where
import Text.HTML.TagSoup
import Prelude
import WordDef
import qualified Data.List as L
import qualified Data.Text.Lazy as T
printDifferentDef :: T.Text -> IO ()
printDifferentDef x = putStrLn (getDifferentDef x)
getDifferentDef :: T.Text -> String
getDifferentDef x = getDifferentDefMessages (length xs) ++ renderWordList xs
where xs = (parseDefTags . differentDefTags) x
getDifferentDefMessages :: Int -> String
getDifferentDefMessages 0 = "Aucune définition disponible."
getDifferentDefMessages 1 = "Une définition disponible : \n"
getDifferentDefMessages _ = "Plusieurs définitions disponibles : \n"
parseDefTags :: [Tag String] -> [WordDefHeader]
parseDefTags [] = []
parseDefTags xs =
(WordDefHeader (WordName (renderTags (take 1 (fst st))))
(WordType (renderTags [(fst st) !! 1])))
: (parseDefTags (snd st))
where st = subTags xs
subTags :: [Tag String] -> ([Tag String],[Tag String])
subTags [] = ([],[])
subTags xs = if (length xs) `mod` 3 /= 0
then splitAt 2 xs
else ((take 1 xs) ++ (take 1 (drop 2 xs)) , drop 3 xs)
differentDefTags :: T.Text -> [Tag String]
differentDefTags =
filter (~== TagText "") . (takeWhile (~/= "<div id=contentbox>"))
. (dropWhile (~/= "<div id=vtoolbar>")) . parseTags . T.unpack
printDef :: T.Text -> Int -> IO ()
printDef t n = putStrLn (getDef t n)
getDef :: T.Text -> Int -> String
getDef t n = renderFullDef (getFullDef t n)
getFullDef :: T.Text -> Int -> WordFullDef
getFullDef t n = WordFullDef header defList
where
header = ((parseDefTags . differentDefTags) t !! (n-1))
defList = map (WordDefSentence . renderTags) [parseFullDefTags t]
parseFullDefTags :: T.Text -> [Tag String]
parseFullDefTags = findRelevantTags . parseTags . T.unpack
findRelevantTags :: [Tag String] -> [Tag String]
findRelevantTags [] = []
findRelevantTags (x:xs) = if x ~== "<span class=tlf_cdefinition"
|| x ~== "<span class=tlf_cexemple"
then maybe (f xs) (\y -> y:(f xs)) (L.find (~== TagText "") xs)
else (f xs)
where
f = findRelevantTags