init
This commit is contained in:
0
spider/normalutils/__init__.py
Normal file
0
spider/normalutils/__init__.py
Normal file
59
spider/normalutils/choices/__init__.py
Normal file
59
spider/normalutils/choices/__init__.py
Normal file
@ -0,0 +1,59 @@
|
||||
# from userscontent.models import ContentType
|
||||
import enum
|
||||
from typing import Iterator, Union
|
||||
|
||||
|
||||
class BaseChoices(enum.Enum):
    """Enum base class that exposes its member values as a set of choices."""

    @classmethod
    def is_valid(cls, value: str, raise_exception: bool = False) -> bool:
        """Tell whether *value* is one of this enum's member values.

        With ``raise_exception`` set, a non-string *value* raises
        ``TypeError`` and an unknown string raises ``ValueError`` instead of
        returning ``False``.
        """
        # Anything that is not a string is rejected before the lookup.
        if not isinstance(value, str):
            if raise_exception:
                raise TypeError("The type of 'value' is wrong.")
            return False

        known = value in cls.choices()
        if known or not raise_exception:
            return known
        raise ValueError(
            "The class '{}' does not have {}".format(cls.__name__, value)
        )

    @classmethod
    def choices(cls, iter=False) -> Union[tuple, Iterator]:
        """Return the member values in definition order.

        A tuple is returned by default; with a truthy ``iter`` a lazy
        iterator over the same values is returned instead.
        """
        members = tuple(cls)
        member_values = map(lambda member: member.value, members)
        return member_values if iter else tuple(member_values)
|
||||
|
||||
|
||||
class WordPropertyType(BaseChoices):
    """Part-of-speech abbreviations used as choice values."""

    NOUN = "n."
    PRONOUN = "pron."  # pronoun
    ADJECTIVE = "adj."
    ADVERB = "adv."
    VERB = "v."
    NUMBERAL = "num."
    ARTICLE = "art."
    PREPOTION = "prep."
    CONJUNCTION = "conj."
    INTERJECTION = "interj."
    ABBREVIATION = "abbr."
    COMBINATION = "comb."
    SUFFIX = "suff."  # suffix

    # Correctly spelled aliases for the two misspelled members above.
    # Because they repeat an existing value, Enum treats them as aliases:
    # the canonical names (NUMBERAL, PREPOTION) stay as they were and the
    # aliases are skipped during iteration, so choices() is unchanged.
    NUMERAL = "num."
    PREPOSITION = "prep."
|
||||
|
||||
|
||||
class StateType(BaseChoices):
    """State choices, each stored as a two-letter code."""

    # Member names spell out what each two-letter code stands for.
    REFUSED = "rf"
    CHECKING = "ck"
    PUBLISHED = "pb"
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke check: list the StateType values, then probe is_valid
    # with a non-string, an unknown string and a known value.
    print(list(StateType.choices()))
    for probe in (1, "refuse", "rf"):
        print(StateType.is_valid(probe))
|
17
spider/normalutils/choices/htmlchoices.py
Normal file
17
spider/normalutils/choices/htmlchoices.py
Normal file
@ -0,0 +1,17 @@
|
||||
from normalutils.choices import BaseChoices
|
||||
|
||||
|
||||
class HttpMethod(BaseChoices):
    """HTTP request methods accepted as choice values."""

    GET = "GET"
    POST = "POST"
    PUT = "PUT"
    PATCH = "PATCH"
    DELETE = "DELETE"
    # Was misspelled "OPOTIONS", which made is_valid("OPTIONS") fail.
    OPTIONS = "OPTIONS"
|
||||
|
||||
|
||||
class HtmlContentType(BaseChoices):
    """Content-type strings for the supported text formats."""

    # Plain and marked-up text.
    TEXT_PLAIN = "text/plain"
    TEXT_HTML = "text/html"
    TEXT_MARKDOWN = "text/markdown"
    # Structured data.
    APPLICATION_JSON = "application/json"
|
79
spider/normalutils/spider.py
Normal file
79
spider/normalutils/spider.py
Normal file
@ -0,0 +1,79 @@
|
||||
from typing import Callable, Optional
|
||||
|
||||
from fake_useragent import UserAgent
|
||||
import httpx
|
||||
|
||||
from validator import Validator
|
||||
import httpchoices
|
||||
|
||||
|
||||
class NoValidator(Validator):
    """Validator that accepts everything; the default for ``Spider``."""

    def is_valid(self, raise_error):
        """Always report success; ``raise_error`` is ignored."""
        return True
|
||||
|
||||
|
||||
class Spider:
    """Fetch a URL asynchronously, parse the response text and validate it.

    Subclasses set ``parser`` (a callable that turns the response text into
    data) and may replace ``validator_class``.
    """

    # Validator *class*, not an instance: is_valid() instantiates it with the
    # parsed data.
    validator_class: Validator = NoValidator
    # Callable that converts the response text into parsed data. It has no
    # default, so get_parser() fails until a subclass provides one.
    parser: Callable[[str], dict] = None

    def __init__(
        self,
        url,
        method="GET",
        request_data: Optional[dict] = None,
        params: Optional[dict] = None,
    ) -> None:
        """Store the request description; no network request is made here.

        Raises whatever ``HttpMethod.is_valid(method, True)`` raises when
        *method* is not an accepted value.
        """
        self.useragent = UserAgent()
        self.headers = {"User-Agent": self.useragent.random}
        self.__data = {}
        self.__html = ""
        self.url = url
        self.method = method
        self.has_verified = False
        # NOTE(review): the choices module added in this commit is named
        # htmlchoices.py, while this code refers to `httpchoices` -- confirm
        # that the import at the top of this file resolves.
        httpchoices.HttpMethod.is_valid(method, True)
        self.__request_parameters = {
            "data": request_data,
            "params": params,
        }

    def get_parser(self):
        """Return the parser configured on the class.

        Raises:
            AssertionError: if no parser has been configured.
        """
        # An explicit raise keeps this check active under ``python -O``,
        # where ``assert`` statements are removed. The exception type is the
        # one the previous ``assert`` produced.
        if self.parser is None:
            raise AssertionError(
                "{}.parser is not set".format(self.__class__.__name__)
            )
        # Read from the class so a plain function is returned unbound and can
        # be called with the response text only.
        return self.__class__.parser

    async def __get_html(self) -> str:
        """Return the response text, requesting it on first use."""
        if self.__html != "":
            return self.__html
        async with httpx.AsyncClient() as client:
            try:
                response = await client.request(
                    self.method,
                    self.url,
                    headers=self.headers,
                    **self.__request_parameters
                )
                response.raise_for_status()
                self.__html = response.text
            except httpx.HTTPStatusError:
                # Best effort: an error status leaves the cached text empty,
                # so "" is returned and the next call retries the request.
                pass
        return self.__html

    async def __get_data(self) -> dict:
        """Return the parsed data, fetching and parsing on first use."""
        # An empty dict or list counts as "not parsed yet".
        if self.__data == {} or self.__data == []:
            html = await self.__get_html()
            self.__data = self.get_parser()(html)
        return self.__data

    async def is_valid(self, raise_exception=False) -> bool:
        """Validate the parsed data with ``validator_class``.

        ``raise_exception`` is forwarded to the validator's ``is_valid``.
        A truthy result is remembered in ``has_verified``.
        """
        data = await self.__get_data()
        validator = self.validator_class(data)
        ans = validator.is_valid(raise_exception)
        if ans:
            self.has_verified = True
        return ans

    async def data(self) -> dict:
        """Return the parsed data, running validation first if needed."""
        if not self.has_verified:
            # Presumably the validator raises here on invalid data -- this
            # depends on the configured validator_class.
            await self.is_valid(True)
        return self.__data
|
0
spider/normalutils/utils/__init__.py
Normal file
0
spider/normalutils/utils/__init__.py
Normal file
21
spider/normalutils/utils/contenttohtml.py
Normal file
21
spider/normalutils/utils/contenttohtml.py
Normal file
@ -0,0 +1,21 @@
|
||||
from markdown import markdown
|
||||
import html
|
||||
|
||||
from normalutils.choices.htmlchoices import HtmlContentType
|
||||
|
||||
|
||||
def content_to_html(content: str, content_type=HtmlContentType.TEXT_MARKDOWN, title=None):
    """Convert *content* to an HTML fragment according to *content_type*.

    * ``TEXT_MARKDOWN``: the result of ``markdown(content)``; *title* is not
      used.
    * ``TEXT_PLAIN``: the text is HTML-escaped and every line is wrapped in
      ``<p>...</p>``; when *title* is given it is escaped and prepended as an
      ``<h1>`` heading.
    * any other type: ``None``.
    """
    if content_type == HtmlContentType.TEXT_MARKDOWN:
        return markdown(content)

    if content_type == HtmlContentType.TEXT_PLAIN:
        lines = html.escape(content).split('\n')
        # A single join replaces repeated ``+=`` on a growing string.
        paragraphs = ''.join(
            ''.join(["<p>", line, "</p>\n"]) for line in lines
        )
        # add title
        if title is not None:
            return ''.join(['<h1>', html.escape(title), '</h1>\n', paragraphs])
        return paragraphs

    # NOTE(review): other content types (e.g. TEXT_HTML) are not converted
    # and yield None -- confirm whether callers expect a pass-through.
    return None
|
72
spider/normalutils/utils/random.py
Normal file
72
spider/normalutils/utils/random.py
Normal file
@ -0,0 +1,72 @@
|
||||
from typing import Callable
|
||||
import random
|
||||
from functools import wraps
|
||||
|
||||
from django.utils import timezone
|
||||
|
||||
|
||||
def random_str(typename: str, randomlength: int = 16) -> Callable[[], str]:
    """Build a zero-argument generator of random strings.

    Parameters
    ----------
    typename: alphabet to draw from -- 'common' [A-Za-z0-9] or
        'lower' [a-z0-9].
    randomlength: number of characters in each generated string.

    Returns
    -------
    A function that takes no arguments and returns a new random string on
    every call.

    Raises
    ------
    ValueError: if *typename* is not one of the supported names.
    """
    # "Ii" was missing from this alphabet although the documented range is
    # [A-Za-z0-9].
    common = "AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789"
    lower = "abcdefghijklmnopqrstuvwxyz0123456789"

    if typename == 'common':
        chars = common
    elif typename == 'lower':
        chars = lower
    else:
        raise ValueError(
            "typename must be 'common' or 'lower', got {!r}".format(typename)
        )

    def _do() -> str:
        return "".join(random.choice(chars) for _ in range(randomlength))

    return _do
|
||||
|
||||
|
||||
def create_random_unique_str(rand_func: Callable[[None], str]):
    """Build a decorator whose wrapped function returns unique strings.

    Each string is the current timestamp in whole seconds followed by the
    output of *rand_func*. Strings already issued during the current second
    are remembered and never returned twice. The decorated function's own
    body is never executed; only its metadata is kept.
    """
    time_string = None  # second for which strings are currently tracked
    list_string = []  # strings already issued during that second

    def _stamp_and_token(make_suffix: Callable[[None], str]):
        # Pair the candidate string with the second it belongs to.
        second = str(int(timezone.now().timestamp()))
        return second + make_suffix(), second

    def _next_unique():
        nonlocal time_string
        nonlocal list_string
        # Retry until a string not yet issued in this second is drawn. Once
        # every suffix for the second is used up, this keeps looping until
        # the clock moves on to the next second.
        while True:
            candidate, second = _stamp_and_token(rand_func)
            if time_string != second:
                # New second: start tracking afresh.
                time_string = second
                list_string = [candidate]
                return candidate
            if candidate not in list_string:
                list_string.append(candidate)
                return candidate

    def _decorator(func):
        @wraps(func)
        def _wrapper():
            return _next_unique()

        return _wrapper

    return _decorator
|
||||
# return get_unique_str
|
||||
|
||||
|
||||
@create_random_unique_str(random_str('common', 2))
def default_nickname() -> str:
    """Return a unique string: timestamp seconds plus two 'common' characters.

    The value is produced by the decorator; this body never runs.
    """
|
||||
|
||||
|
||||
@create_random_unique_str(random_str('lower', 1))
def default_version_unique_id() -> str:
    """Return a unique string: timestamp seconds plus one 'lower' character.

    The value is produced by the decorator; this body never runs.
    """
|
14
spider/normalutils/utils/timeit.py
Normal file
14
spider/normalutils/utils/timeit.py
Normal file
@ -0,0 +1,14 @@
|
||||
from time import time
|
||||
from functools import wraps
|
||||
|
||||
|
||||
def timeit(func):
    """Decorator that prints how long each call to *func* takes.

    The wrapped function's return value is passed through unchanged. Nothing
    is printed when *func* raises.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time() can.
    from time import perf_counter

    @wraps(func)
    def _totime(*args, **kwargs):
        started = perf_counter()
        ans = func(*args, **kwargs)
        elapsed = perf_counter() - started
        print("'{}' use time: {}".format(func.__name__, elapsed))
        return ans

    return _totime
|
13
spider/normalutils/utils/validation.py
Normal file
13
spider/normalutils/utils/validation.py
Normal file
@ -0,0 +1,13 @@
|
||||
from validator import ValidationError
|
||||
|
||||
|
||||
def validate_lenth(value: str, max_length: int, min_length: int = 4):
    """Check that ``len(value)`` lies between *min_length* and *max_length*.

    Returns ``None`` when the length is within bounds and raises
    ``ValidationError`` when it is too long or too short. The upper bound is
    checked first.
    """
    size = len(value)

    if size > max_length:
        raise ValidationError(
            "Length is {}. It is longer than {}".format(size, max_length)
        )

    if size < min_length:
        raise ValidationError(
            "Length is {}. It is shorter than {}".format(size, min_length)
        )
|
Reference in New Issue
Block a user