import logging
from typing import Optional

from fake_useragent import UserAgent
import httpx
from parsel import Selector
from htmlmin import minify


def get_content(text: str) -> Optional[str]:
    """Extract the ``.results-content`` fragment from an HTML document.

    Elements matching ``#examples``, ``#webTrans``, ``script``, ``style``,
    ``a`` and ``.img-list`` are removed from the fragment before it is
    serialized.

    Args:
        text: HTML source to parse.

    Returns:
        The serialized first ``.results-content`` element, or ``None`` when
        the document contains no such element.
    """
    selector = Selector(text).css(".results-content")
    for unwanted in (
        "#examples",
        "#webTrans",
        "script",
        "style",
        "a",
        ".img-list",
    ):
        selector.css(unwanted).remove()
    return selector.get()


class WordSpider:
    """Fetch and parse the page for a single word from www.youdao.com."""

    def __init__(self, word: str) -> None:
        """Prepare the request headers and URL for *word*; no request is sent.

        Args:
            word: The word that is interpolated into the lookup URL.
        """
        self.useragent = UserAgent()
        self.headers = {"User-Agent": self.useragent.random}
        self.data = []
        self.word = word
        # Cache of the minified page; "" means "nothing fetched yet".
        self.__html = ''
        self.url = "http://www.youdao.com/w/eng/{}/".format(word)
        # None until a fetch is attempted, then True/False for its outcome.
        self.success = None

    def get_html(self) -> str:
        """Download the page, minify it and cache the result.

        Sets ``self.success`` to ``True`` on success and ``False`` on failure.

        Returns:
            The minified HTML, or an empty string when the request failed
            (transport error or non-success HTTP status). Failures are logged
            rather than raised.
        """
        try:
            response = httpx.get(self.url, headers=self.headers)
            response.raise_for_status()
        except httpx.HTTPError as e:
            # httpx.HTTPError covers both transport failures (RequestError)
            # and bad status codes (HTTPStatusError).
            logging.error("fail. %s", e)
            self.success = False
            return ""
        self.__html = minify(
            response.text,
            remove_comments=True,
            remove_empty_space=True,
        )
        self.success = True
        return self.__html

    @property
    def html(self) -> str:
        """Return the cached page, fetching it first if the cache is empty."""
        if self.__html == "":
            self.get_html()
        return self.__html

    def parse_page(self) -> Optional[str]:
        """Return the cleaned ``.results-content`` fragment of the page.

        Returns:
            The result of :func:`get_content` applied to ``self.html``; this
            is ``None`` when the fragment is missing (including when the
            download failed and the page is empty).
        """
        return get_content(self.html)


if __name__ == "__main__":
    word = WordSpider("a")
    print(word.parse_page())