80 lines
2.3 KiB
Python
80 lines
2.3 KiB
Python
from typing import Callable, Optional
|
|
|
|
from fake_useragent import UserAgent
|
|
import httpx
|
|
|
|
from validator import Validator
|
|
import httpchoices
|
|
|
|
|
|
class NoValidator(Validator):
    """Validator that accepts whatever data it is given."""

    def is_valid(self, raise_error):
        """Report the data as valid unconditionally.

        ``raise_error`` is never consulted, so this method never raises.
        """
        return True
|
|
|
|
|
|
class Spider:
    """Fetch a URL, parse the response text, and validate the parsed data.

    Configuration lives in two class attributes: ``parser`` (must be set on
    the class before any data is requested) and ``validator_class``.
    """

    # Called with the parsed data to build the validator used by ``is_valid``.
    # This holds the validator class itself (or a factory), not an instance.
    validator_class: Callable[..., Validator] = NoValidator
    # Callable that turns the response text into parsed data; ``None`` until
    # one is configured on the class.
    parser: Optional[Callable[[str], dict]] = None

    def __init__(
        self,
        url,
        method="GET",
        request_data: Optional[dict] = None,
        params: Optional[dict] = None,
    ) -> None:
        """Store the request description; no network traffic happens here.

        Args:
            url: Address to request.
            method: HTTP method name passed to ``httpx``.
            request_data: Forwarded to the request as ``data``.
            params: Forwarded to the request as ``params``.
        """
        # Check the method before doing any other setup work.
        # NOTE(review): presumably raises for an unsupported method when the
        # second argument is True -- confirm against ``httpchoices``.
        httpchoices.HttpMethod.is_valid(method, True)

        self.useragent = UserAgent()
        self.headers = {"User-Agent": self.useragent.random}
        self.__data = {}
        self.__html = ""
        self.url = url
        self.method = method
        self.has_verified = False
        self.__request_parameters = {
            "data": request_data,
            "params": params,
        }

    def get_parser(self):
        """Return the parser callable configured on the class.

        Raises:
            AssertionError: If the class-level ``parser`` is ``None``.
        """
        # Read from the class (not the instance) so a plain function is
        # returned unbound and can be called with just the HTML text.
        parser = self.__class__.parser
        # Explicit raise rather than ``assert`` so the check is not stripped
        # under ``python -O``; the exception type is unchanged for callers.
        if parser is None:
            raise AssertionError(
                f"{self.__class__.__name__}.parser must be set to a callable"
            )
        return parser

    async def __get_html(self) -> str:
        """Return the response text, requesting it if none is cached yet.

        An HTTP error status is logged and leaves the cache empty, so ``""``
        is returned and the next call tries the request again.
        """
        if self.__html != "":
            return self.__html

        async with httpx.AsyncClient() as client:
            response = await client.request(
                self.method,
                self.url,
                headers=self.headers,
                **self.__request_parameters
            )
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as exc:
                # Best-effort fetch: keep going with empty HTML, but leave a
                # trace instead of silently discarding the failure.
                import logging

                logging.getLogger(__name__).warning(
                    "%s %s returned HTTP status %s; no HTML was stored",
                    self.method,
                    self.url,
                    exc.response.status_code,
                )
            else:
                self.__html = response.text
        return self.__html

    async def __get_data(self) -> dict:
        """Return the parsed data, parsing the HTML if nothing is cached.

        An empty dict or empty list counts as "not parsed yet", so a parser
        that yields an empty result is run again on the next call.
        """
        if self.__data in ({}, []):
            html = await self.__get_html()
            self.__data = self.get_parser()(html)
        return self.__data

    async def is_valid(self, raise_exception=False) -> bool:
        """Validate the parsed data with ``validator_class``.

        Args:
            raise_exception: Passed through to the validator's ``is_valid``.

        Returns:
            The validator's result. ``has_verified`` is set to ``True`` when
            that result is truthy.
        """
        data = await self.__get_data()
        validator = self.validator_class(data)
        result = validator.is_valid(raise_exception)
        if result:
            self.has_verified = True
        return result

    async def data(self) -> dict:
        """Return the parsed data, running validation first if needed.

        When the data has not been verified yet, ``is_valid(True)`` is
        awaited before the data is returned.
        """
        if not self.has_verified:
            await self.is_valid(True)
        return self.__data
|