# osdict_project/spider/wordspider.py
#
# 57 lines
# 1.5 KiB
# Python

import logging
from fake_useragent import UserAgent
import httpx
from parsel import Selector
from htmlmin import minify
def get_content(text: str) -> str:
    """Extract the ``.results-content`` part of a page, minus unwanted nodes.

    The markup in *text* is parsed, narrowed to the elements matching
    ``.results-content``, and every descendant matching one of the
    selectors below is removed before the selection is serialized.

    Args:
        text: HTML markup of the page.

    Returns:
        Whatever ``.get()`` yields for the cleaned selection.
        NOTE(review): parsel returns ``None`` here when nothing matches
        ``.results-content`` -- confirm callers cope with that.
    """
    # Removed in this exact order; each query runs against the tree as
    # left by the previous removals.
    unwanted_selectors = (
        "#examples",
        "#webTrans",
        "script",
        "style",
        "a",
        ".img-list",
    )
    results = Selector(text).css(".results-content")
    for css_query in unwanted_selectors:
        results.css(css_query).remove()
    return results.get()
class WordSpider:
    """Fetch the youdao.com page for one word and extract its content.

    The page is downloaded lazily: nothing is requested until
    ``get_html()`` is called or the ``html`` property is read.
    """

    def __init__(self, word: str) -> None:
        """Store *word* and prepare the request URL and headers.

        Args:
            word: The word to look up; it is inserted into the URL as-is.
        """
        self.useragent = UserAgent()
        # One randomly picked User-Agent string, reused for every request
        # made by this instance.
        self.headers = {"User-Agent": self.useragent.random}
        self.data = []
        self.word = word
        # Cache of the minified page; stays empty until a fetch succeeds.
        self.__html = ''
        self.url = "http://www.youdao.com/w/eng/{}/".format(word)
        # None until get_html() runs, then True/False for the last attempt.
        self.success = None

    def get_html(self) -> str:
        """Download the page, minify it and cache the result.

        Returns:
            The minified HTML when the request succeeds. When the server
            answers with an error status, the error is logged,
            ``success`` is set to ``False`` and the cache is returned
            unchanged (``""`` if nothing was fetched before).

        Raises:
            Any httpx error other than ``httpx.HTTPStatusError`` (these
            are not handled here and propagate to the caller).
        """
        try:
            response = httpx.get(self.url, headers=self.headers)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            # Lazy %-formatting: concatenating str + exception would raise
            # TypeError inside the error handler.
            logging.error("fail. %s", e)
            self.success = False
            # Return early so the error page is neither cached nor
            # reported as a success.
            return self.__html
        self.__html = minify(
            response.text,
            remove_comments=True,
            remove_empty_space=True,
        )
        self.success = True
        return self.__html

    @property
    def html(self) -> str:
        """Return the cached page, fetching it first if the cache is empty."""
        if not self.__html:
            self.get_html()
        return self.__html

    def parse_page(self):
        """Return ``get_content`` applied to the page HTML."""
        return get_content(self.html)
if __name__ == "__main__":
    # Manual smoke run: look up the word "a" and print the extracted content.
    spider = WordSpider("a")
    print(spider.parse_page())