osdict_project/spider/normalutils/spider.py

80 lines
2.3 KiB
Python

import logging
from typing import Callable, Optional, Type

from fake_useragent import UserAgent
import httpx
from validator import Validator
import httpchoices
class NoValidator(Validator):
    """Validator that accepts everything; used as ``Spider``'s default."""

    def is_valid(self, raise_error: bool) -> bool:
        """Report the data as valid without inspecting it.

        Args:
            raise_error: Accepted for signature compatibility; ignored,
                because this validator never fails.

        Returns:
            Always ``True``.
        """
        return True
class Spider:
    """Fetch one URL, parse the response body, and validate the parsed data.

    Subclasses configure behaviour through two class attributes:

    * ``parser`` -- callable that receives the response text and returns the
      parsed data. It is required; there is no default.
    * ``validator_class`` -- class that is instantiated with the parsed data
      and asked ``is_valid(raise_exception)``. Defaults to ``NoValidator``.

    The response text and the parsed data are cached on the instance.
    """

    # Holds a class, not an instance; it is instantiated in ``is_valid``.
    validator_class: Type[Validator] = NoValidator
    # Looked up on the class in ``get_parser`` so that a plain function
    # assigned here is not bound to the instance as a method.
    parser: Optional[Callable[[str], dict]] = None

    def __init__(
        self,
        url,
        method="GET",
        request_data: Optional[dict] = None,
        params: Optional[dict] = None,
    ) -> None:
        """Store the request description; no network traffic happens here.

        Args:
            url: Address to request.
            method: HTTP method name. It is checked with
                ``httpchoices.HttpMethod.is_valid(method, True)``
                (presumably raising for an unsupported method -- confirm
                against ``httpchoices``).
            request_data: Passed to httpx as the request's ``data``.
            params: Passed to httpx as the request's ``params``.
        """
        # Validate first so a bad method fails before any other work is done.
        httpchoices.HttpMethod.is_valid(method, True)

        self.url = url
        self.method = method
        self.useragent = UserAgent()
        # One random User-Agent is picked per spider instance.
        self.headers = {"User-Agent": self.useragent.random}
        self.has_verified = False
        self.__html = ""
        self.__data = {}
        self.__request_parameters = {
            "data": request_data,
            "params": params,
        }

    def get_parser(self):
        """Return the parser configured on the class.

        Returns:
            The class-level ``parser`` callable, unbound.

        Raises:
            AssertionError: If the class does not define ``parser``.
        """
        parser = type(self).parser
        if parser is None:
            # Raised explicitly (rather than via ``assert``) so the check
            # still runs when Python is started with ``-O``.
            raise AssertionError(
                f"{type(self).__name__}.parser must be set to a callable"
            )
        return parser

    async def __get_html(self) -> str:
        """Return the response body, issuing the request on first use.

        A non-empty body is cached and returned on later calls. When the
        server answers with an error status, a warning is logged and the
        (still empty) cached body is returned, so the next call retries the
        request. Other httpx errors are not caught here and propagate.
        """
        if self.__html:
            return self.__html

        async with httpx.AsyncClient() as client:
            response = await client.request(
                self.method,
                self.url,
                headers=self.headers,
                **self.__request_parameters,
            )
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as exc:
                # Best effort: keep going with an empty body, but leave a
                # trace instead of failing silently.
                logging.getLogger(__name__).warning(
                    "%s %s returned HTTP %s; treating the body as empty",
                    self.method,
                    self.url,
                    exc.response.status_code,
                )
            else:
                self.__html = response.text
        return self.__html

    async def __get_data(self) -> dict:
        """Return the parsed data, fetching and parsing on first use.

        An empty dict or list is treated as "not parsed yet", so a parser
        that legitimately produces an empty result is re-run on every call.
        """
        if self.__data == {} or self.__data == []:
            html = await self.__get_html()
            self.__data = self.get_parser()(html)
        return self.__data

    async def is_valid(self, raise_exception: bool = False) -> bool:
        """Validate the parsed data with ``validator_class``.

        Args:
            raise_exception: Forwarded to the validator's ``is_valid``.

        Returns:
            The validator's verdict. A truthy verdict also sets
            ``has_verified`` so ``data`` can skip re-validation.
        """
        data = await self.__get_data()
        validator = self.validator_class(data)
        valid = validator.is_valid(raise_exception)
        if valid:
            self.has_verified = True
        return valid

    async def data(self) -> dict:
        """Return the parsed data, validating it first if not yet verified.

        Validation is requested with ``raise_exception=True``; whether and
        what it raises is up to the validator. If the validator reports
        failure without raising, the data is still returned.
        """
        if not self.has_verified:
            await self.is_valid(True)
        return self.__data