# File-viewer metadata captured with this paste: Python, 57 lines, 1.5 KiB.
import logging
|
|
from fake_useragent import UserAgent
|
|
import httpx
|
|
from parsel import Selector
|
|
from htmlmin import minify
|
|
|
|
|
|
def get_content(text: str) -> str:
    """Return the ``.results-content`` markup of a page with clutter stripped.

    The page is parsed with parsel, narrowed to the nodes matching
    ``.results-content``, and then every node matching one of the CSS
    queries below is removed from that subtree, in the listed order.

    Args:
        text: HTML source of the page.

    Returns:
        Whatever ``SelectorList.get()`` yields for the pruned selection.
        NOTE(review): parsel documents ``get()`` as returning ``None`` when
        the selection is empty, which the ``-> str`` annotation does not
        reflect -- confirm how callers should treat a page without
        ``.results-content``.
    """
    # Order is kept identical to the original sequence of removals.
    unwanted_queries = (
        "#examples",
        "#webTrans",
        "script",
        "style",
        "a",
        ".img-list",
    )
    content = Selector(text).css(".results-content")
    for css_query in unwanted_queries:
        content.css(css_query).remove()
    return content.get()
|
|
|
|
|
|
class WordSpider:
    """Fetch and parse the Youdao dictionary page for a single word.

    Attributes set by ``__init__``:
        useragent: ``fake_useragent.UserAgent`` instance used to pick a UA.
        headers: Request headers; carries one random ``User-Agent`` value.
        data: Empty list; not used by the methods of this class.
        word: The word passed to the constructor.
        url: Page URL built from ``word``.
        success: ``None`` until ``get_html`` has run, then ``True`` if the
            page was fetched and ``False`` if the server answered with an
            error status.
    """

    def __init__(self, word: str) -> None:
        """Prepare request headers and the target URL for *word*."""
        self.useragent = UserAgent()
        self.headers = {"User-Agent": self.useragent.random}
        self.data = []
        self.word = word
        self.__html = ''
        self.url = f"http://www.youdao.com/w/eng/{word}/"
        self.success = None

    def get_html(self) -> str:
        """Download the page, minify it and cache the result.

        Returns:
            The minified HTML on success.  When the server responds with an
            error status, the failure is logged, ``success`` is set to
            ``False`` and the previously cached HTML (initially ``""``) is
            returned unchanged.

        Raises:
            Exceptions other than ``httpx.HTTPStatusError`` raised by the
            request (e.g. transport errors) are not caught here.
        """
        try:
            response = httpx.get(self.url, headers=self.headers)
            response.raise_for_status()
        except httpx.HTTPStatusError as e:
            # Lazy %-formatting: concatenating str + exception raised
            # TypeError inside this handler.
            logging.error("fail. %s", e)
            self.success = False
            # Stop here so the error page is not cached and the failure
            # flag is not overwritten with True below.
            return self.__html

        # Positional flags are passed through as in the original call;
        # presumably remove_comments / remove_empty_space -- verify against
        # the installed htmlmin version.
        self.__html = minify(response.text, True, True)
        self.success = True
        return self.__html

    @property
    def html(self) -> str:
        """Cached page HTML; triggers a fetch while the cache is empty."""
        if not self.__html:
            self.get_html()
        return self.__html

    def parse_page(self):
        """Return the result of ``get_content`` applied to ``self.html``."""
        return get_content(self.html)
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: fetch the page for the word "a" and print the
    # extracted content.
    print(WordSpider("a").parse_page())
|