Add a new config to scrape descriptions from Babelio
This commit is contained in:
128
internal/babelio/babelio.go
Normal file
128
internal/babelio/babelio.go
Normal file
@@ -0,0 +1,128 @@
|
||||
package babelio
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
|
||||
"git.artlef.fr/bibliomane/internal/callapiutils"
|
||||
"git.artlef.fr/bibliomane/internal/myvalidator"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"golang.org/x/text/encoding/charmap"
|
||||
)
|
||||
|
||||
// babelioSearchArg is the JSON request body sent to Babelio's
// aj_recherche.php search endpoint.
type babelioSearchArg struct {
	// Term is the search term; here it always carries an ISBN.
	Term string `json:"term"`
}
|
||||
|
||||
// babelioSearchResult is one entry of the JSON array returned by the search
// endpoint. The response carries more fields, but only the book-page URL is
// parsed here.
type babelioSearchResult struct {
	//only parsing the url
	Url string `json:"url"`
}
|
||||
|
||||
func GetDescriptionFromISBN(baseUrl string, isbn string) (string, error) {
|
||||
url, err := searchPageIsbn(baseUrl, isbn)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
//we either find the full summary, or we have to make another call to get it.
|
||||
fullSummary, payloadToQuery, err := parseBookPage(baseUrl, url)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if fullSummary != "" {
|
||||
return decodeAndCleanText(strings.NewReader(fullSummary)), err
|
||||
} else if payloadToQuery != "" {
|
||||
return queryDescription(baseUrl, payloadToQuery)
|
||||
} else {
|
||||
return "", nil
|
||||
}
|
||||
}
|
||||
|
||||
func searchPageIsbn(baseUrl, isbn string) (string, error) {
|
||||
searchUrl, err := callapiutils.ComputeUrl(baseUrl, "aj_recherche.php")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
term := babelioSearchArg{Term: isbn}
|
||||
var searchResults []babelioSearchResult
|
||||
callapiutils.FetchAndParseResultFromPost(searchUrl, &term, &searchResults)
|
||||
if len(searchResults) == 0 {
|
||||
return "", myvalidator.TranslatedError{Err: errors.New("ISBNNotFoundBabelio")}
|
||||
}
|
||||
|
||||
return searchResults[0].Url, nil
|
||||
}
|
||||
|
||||
func parseBookPage(baseUrl, bookUrl string) (string, string, error) {
|
||||
|
||||
url, err := callapiutils.ComputeUrl(baseUrl, bookUrl)
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
}
|
||||
resp, err := http.Get(url.String())
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
||||
//we either find the full summary, or we have to make another call to get it.
|
||||
fullsummary := ""
|
||||
jsToParse := ""
|
||||
doc.Find(".livre_resume").Each(func(i int, s *goquery.Selection) {
|
||||
onclick, ok := s.Find("a").Attr("onclick")
|
||||
if ok {
|
||||
jsToParse = onclick
|
||||
} else {
|
||||
fullsummary = s.Text()
|
||||
}
|
||||
})
|
||||
if fullsummary != "" {
|
||||
return fullsummary, "", nil
|
||||
}
|
||||
typeStr, idObj, err := extractNumbersFromExpression(jsToParse)
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
}
|
||||
return "", fmt.Sprintf("type=%s&id_obj=%s", typeStr, idObj), nil
|
||||
}
|
||||
|
||||
func extractNumbersFromExpression(jsToParse string) (string, string, error) {
|
||||
splitted := strings.Split(jsToParse, ",")
|
||||
if len(splitted) < 3 {
|
||||
return "", "", myvalidator.TranslatedError{Err: errors.New("BabelioParseError")}
|
||||
}
|
||||
if len(splitted[2]) < 3 {
|
||||
return "", "", myvalidator.TranslatedError{Err: errors.New("BabelioParseError")}
|
||||
}
|
||||
return splitted[1], splitted[2][:len(splitted[2])-2], nil
|
||||
}
|
||||
|
||||
func queryDescription(baseUrl string, payloadToQuery string) (string, error) {
|
||||
url, err := callapiutils.ComputeUrl(baseUrl, "aj_voir_plus_a.php")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
resp, err := http.Post(url.String(),
|
||||
"application/x-www-form-urlencoded; charset=UTF-8",
|
||||
strings.NewReader(payloadToQuery))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", myvalidator.TranslatedError{Err: fmt.Errorf("BabelioFetchDescError")}
|
||||
}
|
||||
return decodeAndCleanText(resp.Body), nil
|
||||
}
|
||||
|
||||
func decodeAndCleanText(reader io.Reader) string {
|
||||
tr := charmap.Windows1252.NewDecoder().Reader(reader)
|
||||
var decodedString strings.Builder
|
||||
io.Copy(&decodedString, tr)
|
||||
return strings.TrimSpace(strings.ReplaceAll(decodedString.String(), "<br>", "\n"))
|
||||
}
|
||||
Reference in New Issue
Block a user