init
This commit is contained in:
54
spider/action_getwebdata.py
Normal file
54
spider/action_getwebdata.py
Normal file
@ -0,0 +1,54 @@
|
||||
from typing import Sequence
|
||||
import time
|
||||
import logging
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy.sql.expression import update
|
||||
|
||||
from wordspider import WordSpider
|
||||
from models import WordData
|
||||
import models
|
||||
|
||||
|
||||
AMOUNT = 9500
|
||||
|
||||
|
||||
def get_not_retrieve_word_list(session: Session) -> Sequence:
    """Return the query for every ``WordData`` row with ``has_retrieve`` False.

    The query object itself is returned, so callers can still call
    ``count()`` on it or slice it before iterating.
    """
    return session.query(WordData).filter_by(has_retrieve=False)
|
||||
|
||||
|
||||
def get_data(session: Session, word_list: Sequence) -> None:
    """Fetch the page of each word and store its HTML on the ``WordData`` row.

    Prints ``*`` for every successful fetch and ``F`` for every failed one;
    failed words are left untouched. The updates are committed once, after
    the whole list has been processed.
    """
    # NOTE(review): indentation was lost in this listing; a single commit
    # after the loop is assumed.
    for current in word_list:
        fetcher = WordSpider(current)
        page = fetcher.parse_page()
        if not fetcher.success:
            print("F", end="")
            continue
        print("*", end="")
        statement = (
            update(WordData)
            .where(WordData.word == current)
            .values(html=page, has_retrieve=True)
        )
        session.execute(statement)
    session.commit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    started_at = time.time()
    logging.basicConfig(filename="spider.log")
    with models.Session() as session:
        pending = get_not_retrieve_word_list(session)
        # Process at most AMOUNT words per run.
        batch = pending if pending.count() < AMOUNT else pending[0:AMOUNT]
        get_data(session, map(lambda row: row.word, batch))

    # Report the wall-clock duration of the run in seconds.
    print(time.time() - started_at)
|
43
spider/action_migration.py
Normal file
43
spider/action_migration.py
Normal file
@ -0,0 +1,43 @@
|
||||
import logging
|
||||
|
||||
from normalutils.choices import StateType
|
||||
import omodels as om
|
||||
import models as sm
|
||||
|
||||
|
||||
def word_fn(word):
    """Build the keyword arguments for the target word row from a source word.

    Every migrated word is marked as published.
    """
    return dict(
        spelling=word.spelling,
        importance=word.importance,
        state=StateType.PUBLISHED.value,
    )
|
||||
def meaning_fn(meaning):
    """Build the keyword arguments for the target meaning row.

    Every migrated meaning is marked as published.
    """
    return dict(
        meaning=meaning.meaning,
        word_property=meaning.word_property,
        state=StateType.PUBLISHED.value,
    )
|
||||
|
||||
|
||||
def migrate_word_meanings(o_session, word: sm.Word):
    """Copy one source word and all of its meanings into the target session.

    Words without a spelling are logged and skipped. Each source meaning gets
    its own ``OMeaningField`` that links the new ``OMeaning`` to the new
    ``OWord``. Nothing is committed here; the caller commits.
    """
    if word.spelling is None:
        logging.error("word is None. word id is {}".format(word.id))
        return

    source_meanings = word.meanings
    target_word = om.OWord(**word_fn(word))
    o_session.add(target_word)

    for source_meaning in source_meanings:
        field = om.OMeaningField(word=target_word)
        o_session.add(field)
        target_meaning = om.OMeaning(meaningfield=field, **meaning_fn(source_meaning))
        o_session.add(target_meaning)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Read from the spider database, write to the target database, and commit
    # the target once after every word has been migrated.
    with sm.Session() as s_session, om.OSession() as o_session:
        for source_word in s_session.query(sm.Word):
            migrate_word_meanings(o_session, source_word)
        o_session.commit()
|
14
spider/action_recordword.py
Normal file
14
spider/action_recordword.py
Normal file
@ -0,0 +1,14 @@
|
||||
from models import WordData, Session
|
||||
|
||||
|
||||
# Seed the word_data table with one row per word of the frequency list.
with open("./google-10000-english.txt", "r", encoding="utf-8") as f:
    # Strip line endings, skip blank lines, and drop repeated words (keeping
    # first-seen order): WordData.word is unique and must not be empty-looking
    # noise, and one duplicate would make the whole commit fail.
    words = list(dict.fromkeys(line.strip() for line in f if line.strip()))

with Session() as session:
    for word in words:
        session.add(WordData(word=word))
    session.commit()
|
46
spider/action_serialize.py
Normal file
46
spider/action_serialize.py
Normal file
@ -0,0 +1,46 @@
|
||||
import logging
|
||||
|
||||
from models import Session, WordData, Word
|
||||
from serializers import WordAddSerializer, MeaningAddSerializer
|
||||
from renderercontents import renderer_word, renderer_meaningslist, has_value_to_render
|
||||
|
||||
|
||||
def create_word_meaning(text, session):
    """Parse one stored page and add its word and meanings to *session*.

    The word is skipped when it fails validation or when a ``Word`` with the
    same spelling already exists. Each meaning is validated on its own; an
    invalid meaning is logged and skipped so the remaining ones are still
    saved. Nothing is committed here.

    Raises:
        Exception: whatever ``renderer_meaningslist`` raises, after logging it.
    """
    word_dict = renderer_word(text)
    word_serializer = WordAddSerializer(word_dict, session)
    try:
        word_ok = word_serializer.is_valid(True)
    except Exception as e:
        logging.error("msg: {} data: {}".format(e, word_dict))
        return  # fail validation
    if not word_ok:
        return  # fail validation
    if session.query(Word).filter_by(spelling=word_dict['spelling']).count() > 0:
        return  # repeat

    word = word_serializer.save()
    if word.spelling is None:  # word is null
        logging.error("word spelling is null word_dict: {}\n{}".format(word_dict, text))

    try:
        meaning_list = renderer_meaningslist(text)
    except Exception as e:
        logging.error("msg: {} word: {}\n{}".format(e, word.spelling, text))
        raise

    for meaning in meaning_list:
        meaning_serializer = MeaningAddSerializer(meaning, session, word)
        try:
            meaning_ok = meaning_serializer.is_valid(True)
        except Exception as e:
            logging.error("msg: {} word: {} data: {}".format(e, word.spelling, meaning))
            continue
        # save() asserts that validation produced no errors, so it must only
        # run for meanings that actually validated.
        if meaning_ok:
            meaning_serializer.save()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    logging.basicConfig(filename="spider.log")
    with Session() as session:
        retrieved = session.query(WordData).filter_by(has_retrieve=True)
        for row in retrieved:
            page = row.html
            # Pages carrying the ".error-typo" marker have nothing to parse.
            if has_value_to_render(page):
                create_word_meaning(page, session)
        session.commit()
|
10000
spider/google-10000-english.txt
Normal file
10000
spider/google-10000-english.txt
Normal file
File diff suppressed because it is too large
Load Diff
10
spider/httpchoices.py
Normal file
10
spider/httpchoices.py
Normal file
@ -0,0 +1,10 @@
|
||||
from normalutils.choices import BaseChoices
|
||||
|
||||
|
||||
class HttpMethod(BaseChoices):
    """HTTP request methods; each value is the method name sent on the wire."""

    GET = "GET"
    POST = "POST"
    PUT = "PUT"
    PATCH = "PATCH"
    DELETE = "DELETE"
    OPTIONS = "OPTIONS"
|
86
spider/models.py
Normal file
86
spider/models.py
Normal file
@ -0,0 +1,86 @@
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import create_engine, Column
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import relationship, sessionmaker
|
||||
from sqlalchemy.sql.schema import ForeignKey
|
||||
from sqlalchemy.sql.sqltypes import Boolean, DateTime, String, Text, Integer
|
||||
|
||||
|
||||
engine = create_engine("sqlite:///spider.sqlite3", future=True)
|
||||
Base = declarative_base()
|
||||
Session = sessionmaker(engine, future=True)
|
||||
|
||||
|
||||
class Word(Base):
    """A word of the ``word`` table; its meanings are reachable as ``meanings``."""

    __tablename__ = "word"

    id = Column(Integer, primary_key=True)
    spelling = Column(String(64), unique=True)
    importance = Column("importance", String(32), nullable=True)

    def __repr__(self) -> str:
        # spelling is nullable, and repr() must return a str: returning None
        # would raise TypeError as soon as such a row is printed.
        if self.spelling is None:
            return "<Word id={} spelling=None>".format(self.id)
        return self.spelling
|
||||
|
||||
|
||||
class Meaning(Base):
    """One definition of a word, stored in the ``meaning`` table."""

    __tablename__ = "meaning"

    id = Column(Integer, primary_key=True)
    meaning = Column(String(128))
    word_property = Column(String(8))
    # NOTE(review): spelled "sentance" here, while MeaningAddSerializer
    # declares a "sentence" field - confirm which spelling is intended.
    sentance = Column(String(128), nullable=True)
    add_time = Column(DateTime, default=datetime.utcnow)
    word_id = Column(
        Integer,
        ForeignKey("{0}.id".format(Word.__tablename__), ondelete="CASCADE"),
    )

    # Also creates the reverse collection ``Word.meanings``.
    word = relationship("Word", backref="meanings")

    def __repr__(self) -> str:
        owner = self.word
        return "{} {} {}".format(owner.spelling, self.word_property, self.meaning)
|
||||
|
||||
|
||||
class WordData(Base):
    """Crawl state of one word: whether its page was fetched, and the HTML."""

    __tablename__ = "word_data"

    id = Column(Integer, primary_key=True)
    word = Column(String(64), unique=True, nullable=False)
    # Set to True once the page HTML has been stored in ``html``.
    has_retrieve = Column(Boolean, default=False, nullable=False)
    url = Column(String(256), nullable=True)
    html = Column(Text, nullable=True)

    def __repr__(self) -> str:
        return "'{}' {}".format(self.word, self.has_retrieve)
|
||||
|
||||
|
||||
def clear_word_null():
    """Delete every ``Word`` row whose spelling is NULL, then commit."""
    with Session() as session:
        for orphan in session.query(Word).filter_by(spelling=None):
            session.delete(orphan)
        session.commit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Base.metadata.create_all(engine)

    def testcase():
        """Print the meanings attached to the word "the"."""
        with Session() as session:
            the = session.query(Word).filter_by(spelling="the").one()
            print(the.meanings)

    # testcase()

    def testcase2():
        """Print how many words have no spelling."""
        with Session() as session:
            print(session.query(Word).filter_by(spelling=None).count())

    def testcase3():
        """Print the importance of "john" and the type it is stored as."""
        with Session() as session:
            john = session.query(Word).filter_by(spelling="john").one()
            print(john.importance)
            print(type(john.importance))  # => str something is wrong

    # clear_word_null()
    testcase3()
|
0
spider/normalutils/__init__.py
Normal file
0
spider/normalutils/__init__.py
Normal file
59
spider/normalutils/choices/__init__.py
Normal file
59
spider/normalutils/choices/__init__.py
Normal file
@ -0,0 +1,59 @@
|
||||
# from userscontent.models import ContentType
|
||||
import enum
|
||||
from typing import Iterator, Union
|
||||
|
||||
|
||||
class BaseChoices(enum.Enum):
    """Enum base class with helpers to list and check the member values."""

    @classmethod
    def is_valid(cls, value: str, raise_exception: bool = False) -> bool:
        """Tell whether *value* is one of this enum's member values.

        With ``raise_exception`` set, a non-string raises ``TypeError`` and an
        unknown string raises ``ValueError`` instead of returning False.
        """
        if not isinstance(value, str):
            if raise_exception:
                raise TypeError("The type of 'value' is wrong.")
            return False
        if value in cls.choices():
            return True
        if raise_exception:
            raise ValueError(
                "The class '{}' does not have {}".format(cls.__name__, value)
            )
        return False

    @classmethod
    def choices(cls, iter=False) -> Union[tuple, Iterator]:
        """Return the member values, as a tuple or (``iter=True``) an iterator."""
        values = map(lambda member: member.value, tuple(cls))
        return values if iter else tuple(values)
|
||||
|
||||
|
||||
class WordPropertyType(BaseChoices):
    """Part-of-speech abbreviations accepted for a meaning."""

    NOUN = "n."
    PRONOUN = "pron."  # pronoun
    ADJECTIVE = "adj."
    ADVERB = "adv."
    VERB = "v."
    NUMBERAL = "num."
    ARTICLE = "art."
    PREPOTION = "prep."
    CONJUNCTION = "conj."
    INTERJECTION = "interj."
    ABBREVIATION = "abbr."
    COMBINATION = "comb."
    SUFFIX = "suff."  # suffix
|
||||
|
||||
|
||||
class StateType(BaseChoices):
    """Two-letter review state codes stored with words and meanings."""

    REFUSED = "rf"
    CHECKING = "ck"
    PUBLISHED = "pb"
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke check: the value list, then a non-string, an unknown
    # string and a known value.
    for outcome in (
        list(StateType.choices()),
        StateType.is_valid(1),
        StateType.is_valid("refuse"),
        StateType.is_valid("rf"),
    ):
        print(outcome)
|
17
spider/normalutils/choices/htmlchoices.py
Normal file
17
spider/normalutils/choices/htmlchoices.py
Normal file
@ -0,0 +1,17 @@
|
||||
from normalutils.choices import BaseChoices
|
||||
|
||||
|
||||
class HttpMethod(BaseChoices):
    """HTTP request methods; each value is the method name sent on the wire."""

    GET = "GET"
    POST = "POST"
    PUT = "PUT"
    PATCH = "PATCH"
    DELETE = "DELETE"
    OPTIONS = "OPTIONS"
|
||||
|
||||
|
||||
class HtmlContentType(BaseChoices):
    """Media types a piece of content can be stored as."""

    TEXT_PLAIN = "text/plain"
    TEXT_HTML = "text/html"
    TEXT_MARKDOWN = "text/markdown"
    APPLICATION_JSON = "application/json"
|
79
spider/normalutils/spider.py
Normal file
79
spider/normalutils/spider.py
Normal file
@ -0,0 +1,79 @@
|
||||
from typing import Callable, Optional
|
||||
|
||||
from fake_useragent import UserAgent
|
||||
import httpx
|
||||
|
||||
from validator import Validator
|
||||
import httpchoices
|
||||
|
||||
|
||||
class NoValidator(Validator):
    """Validator that accepts any data; the default for ``Spider``."""

    def is_valid(self, raise_error=False):
        """Always report the data as valid.

        ``raise_error`` is accepted for signature compatibility and ignored;
        it is optional so the method can also be called without arguments.
        """
        return True
|
||||
|
||||
|
||||
class Spider:
    """Fetch one URL with httpx and expose the parsed, validated result.

    Subclasses set ``parser`` (HTML text -> data) and may replace
    ``validator_class``. Both the fetched HTML and the parsed data are cached
    on the instance.
    """

    validator_class: Validator = NoValidator
    parser: Callable[[str], dict] = None

    def __init__(
        self,
        url,
        method="GET",
        request_data: Optional[dict] = None,
        params: Optional[dict] = None,
    ) -> None:
        """Remember the request to send; raises if *method* is not a known HTTP method."""
        self.useragent = UserAgent()
        # A random User-Agent is picked once per spider instance.
        self.headers = {"User-Agent": self.useragent.random}
        self.__data = {}
        self.__html = ""
        self.url = url
        self.method = method
        self.has_verified = False
        httpchoices.HttpMethod.is_valid(method, True)
        self.__request_parameters = dict(data=request_data, params=params)

    def get_parser(self):
        """Return the parser function configured on the class."""
        assert self.parser is not None
        return type(self).parser

    async def __get_html(self) -> str:
        """Return the response body, requesting it only while none is cached."""
        if self.__html != "":
            return self.__html
        async with httpx.AsyncClient() as client:
            try:
                response = await client.request(
                    self.method,
                    self.url,
                    headers=self.headers,
                    **self.__request_parameters
                )
                response.raise_for_status()
            except httpx.HTTPStatusError:
                # An error status leaves the cached HTML empty.
                pass
            else:
                self.__html = response.text
        return self.__html

    async def __get_data(self) -> dict:
        """Return the parsed data, parsing the HTML while the cache is empty."""
        if self.__data in ({}, []):
            page = await self.__get_html()
            self.__data = self.get_parser()(page)
        return self.__data

    async def is_valid(self, raise_exception=False) -> bool:
        """Validate the parsed data and remember a successful result."""
        payload = await self.__get_data()
        checker = self.validator_class(payload)
        outcome = checker.is_valid(raise_exception)
        if outcome:
            self.has_verified = True
        return outcome

    async def data(self) -> dict:
        """Return the parsed data, validating it first if that has not happened yet."""
        if not self.has_verified:
            await self.is_valid(True)
        return self.__data
|
0
spider/normalutils/utils/__init__.py
Normal file
0
spider/normalutils/utils/__init__.py
Normal file
21
spider/normalutils/utils/contenttohtml.py
Normal file
21
spider/normalutils/utils/contenttohtml.py
Normal file
@ -0,0 +1,21 @@
|
||||
from markdown import markdown
|
||||
import html
|
||||
|
||||
from normalutils.choices.htmlchoices import HtmlContentType
|
||||
|
||||
|
||||
def content_to_html(content: str, content_type=HtmlContentType.TEXT_MARKDOWN, title=None):
    """Render *content* as an HTML fragment.

    Markdown content is converted with ``markdown`` and returned directly;
    *title* is not used in that case. Plain text is HTML-escaped, every line
    is wrapped in ``<p>``, and an escaped ``<h1>`` title is put in front when
    *title* is given.

    Raises:
        ValueError: if *content_type* is neither TEXT_MARKDOWN nor TEXT_PLAIN.
    """
    if content_type == HtmlContentType.TEXT_MARKDOWN:
        return markdown(content)
    if content_type != HtmlContentType.TEXT_PLAIN:
        # No rendering is defined for the other content types; fail with a
        # clear message instead of falling through without a result.
        raise ValueError("unsupported content type: {!r}".format(content_type))

    lines = html.escape(content).split('\n')
    ret = ''.join(''.join(["<p>", sentence, "</p>\n"]) for sentence in lines)
    # add title
    if title is not None:
        return ''.join(['<h1>', html.escape(title), '</h1>\n', ret])
    return ret
|
72
spider/normalutils/utils/random.py
Normal file
72
spider/normalutils/utils/random.py
Normal file
@ -0,0 +1,72 @@
|
||||
from typing import Callable
|
||||
import random
|
||||
from functools import wraps
|
||||
|
||||
from django.utils import timezone
|
||||
|
||||
|
||||
def random_str(typename: str, randomlength: int = 16) -> Callable[[], str]:
    """Return a zero-argument function producing random strings.

    Parameters
    ----------
    typename: 'common' draws from upper- and lower-case letters plus digits
        (the alphabet below contains no 'I'/'i'); 'lower' draws from [a-z0-9].
    randomlength: number of characters in every generated string.

    Raises
    ------
    ValueError: if *typename* is neither 'common' nor 'lower'.

    The characters come from the ``random`` module.
    """
    common = "AaBbCcDdEeFfGgHhJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789"
    lower = "abcdefghijklmnopqrstuvwxyz0123456789"

    if typename == 'common':
        chars = common
    elif typename == 'lower':
        chars = lower
    else:
        raise ValueError(
            "unknown typename {!r}; expected 'common' or 'lower'".format(typename)
        )

    last_index = len(chars) - 1

    def _do() -> str:
        return "".join(
            chars[random.randint(0, last_index)] for _ in range(randomlength)
        )

    return _do
|
||||
|
||||
|
||||
def create_random_unique_str(rand_func: Callable[[None], str]):
    """Build a decorator that turns a function into a unique-string generator.

    The decorated function's body is never called: every call returns the
    current Unix timestamp in whole seconds followed by ``rand_func()``.
    Strings issued during the current second are remembered, and a repeated
    candidate is discarded and drawn again.
    """
    last_second = None
    issued = []

    def create_random(rand_func: Callable[[None], str]):
        # Returns the candidate string and the second it belongs to.
        stamp = str(int(timezone.now().timestamp()))
        return stamp + rand_func(), stamp

    def get_unique_str():
        nonlocal last_second
        nonlocal issued
        while True:
            candidate, stamp = create_random(rand_func)
            if last_second != stamp:
                # A new second started: forget the strings of the previous one.
                last_second = stamp
                issued = [candidate]
                return candidate
            if candidate not in issued:
                issued.append(candidate)
                return candidate

    def decrator_func(func):
        @wraps(func)
        def _do():
            return get_unique_str()

        return _do

    return decrator_func
|
||||
|
||||
|
||||
@create_random_unique_str(random_str('common', 2))
def default_nickname() -> str:
    """Placeholder replaced by the decorator: timestamp plus 2 'common' characters."""
|
||||
|
||||
|
||||
@create_random_unique_str(random_str('lower', 1))
def default_version_unique_id() -> str:
    """Placeholder replaced by the decorator: timestamp plus 1 'lower' character."""
|
14
spider/normalutils/utils/timeit.py
Normal file
14
spider/normalutils/utils/timeit.py
Normal file
@ -0,0 +1,14 @@
|
||||
from time import time
|
||||
from functools import wraps
|
||||
|
||||
|
||||
def timeit(func):
    """Decorator that prints how long each call of *func* took, in seconds.

    The wrapped function's return value is passed through unchanged.
    """

    @wraps(func)
    def _totime(*args, **kwargs):
        started = time()
        result = func(*args, **kwargs)
        elapsed = time() - started
        print("'{}' use time: {}".format(func.__name__, elapsed))
        return result

    return _totime
|
13
spider/normalutils/utils/validation.py
Normal file
13
spider/normalutils/utils/validation.py
Normal file
@ -0,0 +1,13 @@
|
||||
from validator import ValidationError
|
||||
|
||||
|
||||
def validate_lenth(value: str, max_length: int, min_length: int = 4):
    """Raise ``ValidationError`` unless ``min_length <= len(value) <= max_length``.

    Returns None when the length is within bounds.
    """
    length = len(value)
    if length > max_length:
        message = "Length is {}. It is longer than {}".format(length, max_length)
        raise ValidationError(message)
    if length < min_length:
        message = "Length is {}. It is shorter than {}".format(length, min_length)
        raise ValidationError(message)
|
56
spider/omodels.py
Normal file
56
spider/omodels.py
Normal file
@ -0,0 +1,56 @@
|
||||
from datetime import datetime
|
||||
from sqlalchemy import create_engine, Column
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import sessionmaker, relationship
|
||||
from sqlalchemy.sql.schema import ForeignKey
|
||||
from sqlalchemy.sql.sqltypes import Boolean, DateTime, Integer, String
|
||||
|
||||
from normalutils.choices import StateType
|
||||
|
||||
|
||||
oengine = create_engine("sqlite:///db.sqlite3", future=True)
|
||||
Base = declarative_base()
|
||||
OSession = sessionmaker(oengine, future=True)
|
||||
|
||||
|
||||
class OWord(Base):
    """Word row of the target database (table ``word_word``)."""

    __tablename__ = "word_word"

    # The primary key column is called "word_id" in the table.
    id = Column("word_id", Integer, primary_key=True)
    spelling = Column(String(64), nullable=False, unique=True)
    importance = Column(String(32), nullable=True)
    # New rows start in the "checking" state unless a state is given.
    state = Column(String(2), default=StateType.CHECKING.value, nullable=False)
|
||||
|
||||
|
||||
class OMeaningField(Base):
    """Row of ``meaning_meaningfield``; sits between a word and its meanings."""

    __tablename__ = "meaning_meaningfield"

    id = Column("meaningfield_id", Integer, primary_key=True)
    current_version = Column(Integer, default=1)
    has_many = Column(Boolean, default=False)
    word_id = Column(Integer, ForeignKey("word_word.word_id", ondelete="CASCADE"))

    # Also creates the reverse collection ``OWord.meaningfields``.
    word = relationship("OWord", backref="meaningfields")
|
||||
|
||||
|
||||
class OMeaning(Base):
    """Meaning row of the target database (table ``meaning_meaning``)."""

    __tablename__ = "meaning_meaning"

    id = Column("meaning_id", Integer, primary_key=True)
    meaningfield_id = Column(
        Integer,
        ForeignKey("meaning_meaningfield.meaningfield_id", ondelete="CASCADE"),
    )
    author_id = Column(Integer, nullable=True)
    author_name = Column(String(64), nullable=True)
    # New rows start in the "checking" state unless a state is given.
    state = Column(String(2), default=StateType.CHECKING.value, nullable=False)
    word_property = Column(String(8), nullable=False)
    field = Column(String(64), nullable=True)
    version = Column(Integer, default=1, nullable=False)
    meaning = Column(String(128), nullable=False)
    sentence = Column(String(256), nullable=True)
    add_time = Column(DateTime, default=datetime.utcnow)

    # Also creates the reverse collection ``OMeaningField.meanings``.
    meaningfield = relationship("OMeaningField", backref="meanings")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual check: walk from the first meaning back to its word's id.
    with OSession() as session:
        first_meaning = session.query(OMeaning).first()
        print(first_meaning.meaningfield.word.id)
|
96
spider/renderercontents.py
Normal file
96
spider/renderercontents.py
Normal file
@ -0,0 +1,96 @@
|
||||
import re
|
||||
from wordpropertyconversion import word_property_conversion
|
||||
from parsel import Selector
|
||||
|
||||
from models import Session, WordData
|
||||
|
||||
|
||||
def renderer_word(txt) -> dict:
    """Extract the word's spelling and importance from a page's HTML.

    Either value is None when the page has no matching element.
    """
    sel = Selector(txt)
    return {
        "spelling": sel.css(".keyword::text").get(),
        "importance": sel.xpath("//span[@class='via rank']/text()").get(),
    }
|
||||
|
||||
|
||||
def renderer_meaningslist(txt) -> list:
    """Extract the meanings listed on a page as a list of dicts.

    The ``#synonyms`` element is removed first. Returns an empty list when
    the page has no ``.trans-container``; list items that
    ``renderer_meaning`` cannot parse are dropped.
    """
    sel = Selector(txt)
    sel.css("#synonyms").remove()
    containers = sel.css(".trans-container")
    if containers == []:
        return []
    # NOTE(review): the XPath starts with "//", which XPath evaluates from the
    # document root rather than from the first container - confirm intended.
    raw_items = containers[0].xpath("//div/ul/li/text()").getall()
    return [
        parsed for parsed in map(renderer_meaning, raw_items) if parsed is not None
    ]
|
||||
|
||||
|
||||
def renderer_meaning(text):
    """Split one list item such as ``"n. something"`` into property and meaning.

    Returns None when *text* does not start with 1-8 lowercase letters
    followed by a dot. Otherwise returns a dict with the converted
    ``word_property`` and the ``meaning`` text that follows it.
    """
    matched = re.match(r"[a-z]{1,8}\.", text)
    if matched is None:
        return None
    word_property = word_property_conversion(matched.group())
    # Skip the abbreviation as it appears in *text* plus the one separator
    # character after it. The converted abbreviation may differ in length, so
    # it must not be used to compute the offset.
    meaning = text[matched.end() + 1:]
    return {
        "word_property": word_property,
        "meaning": meaning
    }
|
||||
|
||||
|
||||
def has_value_to_render(text):
    """Return True when the page contains no ``.error-typo`` element."""
    error_nodes = Selector(text).css(".error-typo")
    return error_nodes == []
|
||||
|
||||
|
||||
def testcase1():
    """Manual check: parse a line that does not start with a word property."""
    with Session() as session:
        row = session.query(WordData).filter_by(word="ob").first()  # the
        text = row.html
        sample = "linux下的桌面环境"
        print(renderer_meaning(sample))
|
||||
|
||||
|
||||
def testcase3():
    """Manual check: print the meanings parsed from the stored "search" page."""
    with Session() as session:
        row = session.query(WordData).filter_by(word="search").one()
        print(renderer_meaningslist(row.html))
|
||||
|
||||
|
||||
def testcase4():
    "test word importance None"
    with Session() as session:
        row = session.query(WordData).filter_by(word="john").one()
        parsed = renderer_word(row.html)
        print(parsed)
        print(type(parsed["importance"]))
|
||||
|
||||
|
||||
def testcase2():
    """Manual check: a page without any ``.trans-container`` element."""
    txt = """
    <div id="results-contents" class="results-content"><div class="trans-wrapper" id="phrsListTab"><h2 class="wordbook-js"><span class="keyword">hentai</span></h2></div><div id="wordArticle" class="trans-wrapper trans-tab"><h3><span class="tabs"></span></h3><div id="wordArticleToggle"></div></div></div>
    """
    renderer_meaningslist(txt)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Runs the importance check when the module is executed directly.
    testcase4()
|
71
spider/serializers.py
Normal file
71
spider/serializers.py
Normal file
@ -0,0 +1,71 @@
|
||||
from validator import Validator, fields, FieldValidationError
|
||||
from normalutils.choices import WordPropertyType
|
||||
|
||||
from models import Word, Meaning, Session
|
||||
|
||||
|
||||
class WordAddSerializer(Validator):
    """Validate raw word data and add it to the session as a ``Word``."""

    spelling = fields.StringField(1, 64, allow_null=False)
    importance = fields.StringField(required=False)

    def __init__(self, raw_data, session: Session, *args, **kwargs):
        """Keep *session* for ``create``; extra arguments are ignored."""
        super().__init__(raw_data)
        self.__session = session

    def save(self):
        """Create the word from the validated data; requires an error-free validation."""
        assert self.errors == {}
        return self.create(self.validated_data)

    def create(self, data):
        """Add a new ``Word`` built from *data* to the session (no commit) and return it."""
        new_word = Word(**data)
        self.__session.add(new_word)
        return new_word
|
||||
|
||||
|
||||
class MeaningAddSerializer(Validator):
    """Validate one raw meaning and add it to the session as a ``Meaning``."""

    meaning = fields.StringField()
    word_property = fields.StringField()
    sentence = fields.StringField(required=False)

    def __init__(self, raw_data, session: Session, word, *args, **kwargs):
        """Keep *session* and the owning *word*; extra arguments are ignored."""
        super().__init__(raw_data)
        self.__session = session
        self.__word = word

    def validate_word_property(self, data):
        """Reject values that are not a ``WordPropertyType`` value."""
        try:
            WordPropertyType.is_valid(data, True)
        except Exception as e:
            # Keep the original error attached as the cause.
            raise FieldValidationError(e) from e

    def save(self):
        """Create the meaning from the validated data; requires an error-free validation."""
        assert self.errors == {}
        return self.create(self.validated_data)

    def create(self, data):
        """Add a new ``Meaning`` for the word to the session (no commit) and return it."""
        session = self.__session
        values = dict(data)
        # The Meaning model names this attribute "sentance"; passing the
        # serializer's "sentence" key straight through would be rejected by
        # the model constructor as an unknown keyword.
        if "sentence" in values:
            values["sentance"] = values.pop("sentence")
        meaning = Meaning(word=self.__word, **values)
        session.add(meaning)
        return meaning
|
||||
|
||||
|
||||
def testcase1():
    """Manual check: validate, save and commit a sample word."""
    sample = dict(spelling="a", meaning="haha", word_property="n.")
    with Session() as session:
        serializer = WordAddSerializer(sample, session)
        serializer.is_valid()
        serializer.save()
        session.commit()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual check of how a None importance comes out of validation.
    data = {'spelling': "None", 'importance': None}
    with Session() as session:
        serializer = WordAddSerializer(data, session)
        flag = serializer.is_valid()
        print(serializer.validated_data)
|
BIN
spider/spider.sqlite3
Normal file
BIN
spider/spider.sqlite3
Normal file
Binary file not shown.
5
spider/validator/__init__.py
Normal file
5
spider/validator/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
__version__ = '0.0.8'
|
||||
|
||||
from .validator import Validator, create_validator
|
||||
from .fields import *
|
||||
from .exceptions import *
|
69
spider/validator/exceptions.py
Normal file
69
spider/validator/exceptions.py
Normal file
@ -0,0 +1,69 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
import six
|
||||
from .utils import force_text, force_str
|
||||
from .translation import gettext as _
|
||||
|
||||
|
||||
def _flat_error_detail(detail):
    """Recursively convert every leaf of *detail* to text.

    Lists and dicts keep their shape; any other value goes through
    ``force_text``.
    """
    if isinstance(detail, list):
        return [_flat_error_detail(entry) for entry in detail]
    if isinstance(detail, dict):
        return {
            name: _flat_error_detail(entry)
            for name, entry in six.iteritems(detail)
        }
    return force_text(detail)
|
||||
|
||||
|
||||
class BaseValidationError(Exception):
    """Base class of the validation errors; carries a text, list or dict detail."""

    default_detail = _('Base validation error')
    default_code = _('error')

    def __init__(self, detail=None, code=None):
        """
        :param detail: `detail` may be a string, a dict or a list.
        :param code: error code; it is stored but not used for now.
        """
        if detail is None:
            detail = self.default_detail
        if code is None:
            code = self.default_code

        self.detail = _flat_error_detail(detail)
        self.code = code

    def get_detail(self):
        return self.detail

    def __str__(self):
        return force_str(self.detail)

    def __unicode__(self):
        return force_text(self.detail)

    def __repr__(self):
        detail = self.detail
        if isinstance(detail, (list, dict)):
            # Containers cannot be sliced and joined with '...' like a
            # string, so shorten their repr instead.
            shown = repr(detail)
            if len(shown) > 103:
                shown = shown[:100] + '...'
        else:
            if len(detail) > 103:
                detail = detail[:100] + '...'
            shown = '{0!r}'.format(detail)
        return '{0}(detail={1})'.format(self.__class__.__name__, shown)
|
||||
|
||||
|
||||
class FieldRequiredError(BaseValidationError):
    """Validation error whose default detail is 'Field is required'."""

    default_detail = _('Field is required')
    default_code = _('error')
|
||||
|
||||
|
||||
class ValidationError(BaseValidationError):
    """Validation error whose default detail is 'Validation error'."""

    default_detail = _('Validation error')
    default_code = _('error')
|
||||
|
||||
|
||||
class FieldValidationError(BaseValidationError):
    """Validation error whose default detail is 'field Validation error'."""

    default_detail = _('field Validation error')
    default_code = _('error')
|
781
spider/validator/fields.py
Normal file
781
spider/validator/fields.py
Normal file
@ -0,0 +1,781 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
import six
|
||||
import random
|
||||
import string
|
||||
import sys
|
||||
import uuid
|
||||
import re
|
||||
import copy
|
||||
import datetime
|
||||
from collections import OrderedDict
|
||||
from six.moves import urllib_parse as urlparse, range
|
||||
from IPy import IP, MAX_IPV4_ADDRESS, MAX_IPV6_ADDRESS
|
||||
from . import exceptions
|
||||
from .utils import force_text
|
||||
from .translation import gettext as _
|
||||
|
||||
__all__ = [
|
||||
# Don't need to add field to here by hand,
|
||||
# BaseFieldMetaClass will auto add field to here.
|
||||
]
|
||||
|
||||
FIELDS_NAME_MAP = {
|
||||
# Don't need to add field to here by hand,
|
||||
# BaseFieldMetaClass will auto add field to here.
|
||||
}
|
||||
|
||||
|
||||
def create_field(field_info):
    """
    Create a field from a field info dict.

    The 'type' entry selects the registered field class; the remaining
    entries are handed to that class's ``from_dict``. Raises ValueError when
    the type is not registered in FIELDS_NAME_MAP.
    """
    field_type = field_info.get('type')
    if field_type not in FIELDS_NAME_MAP:
        raise ValueError(_('not support this field: {}').format(field_type))
    field_class = FIELDS_NAME_MAP[field_type]
    params = {
        key: entry for key, entry in dict(field_info).items() if key != 'type'
    }
    return field_class.from_dict(params)
|
||||
|
||||
|
||||
class EmptyValue(object):
    """
    Marker type used in place of None to say "no value was given".
    """

    def __init__(self):
        pass

    def __str__(self):
        return '__empty_value__'

    def __repr__(self):
        return '<' + type(self).__name__ + '>'


# Shared marker instance.
EMPTY_VALUE = EmptyValue()
|
||||
|
||||
|
||||
class BaseFieldMetaClass(type):
    """Metaclass that registers every field class while it is being defined.

    The class name is appended to ``__all__``, and the class is stored in
    ``FIELDS_NAME_MAP`` under the FIELD_TYPE_NAME it declares itself, unless
    that name is missing or 'object'.
    """

    def __new__(cls, name, bases, attrs):
        __all__.append(name)
        clazz = super(BaseFieldMetaClass, cls).__new__(cls, name, bases, attrs)
        declared_name = attrs.get('FIELD_TYPE_NAME')
        if declared_name not in (None, 'object'):
            FIELDS_NAME_MAP[declared_name] = clazz
        return clazz
|
||||
|
||||
|
||||
@six.add_metaclass(BaseFieldMetaClass)
class BaseField(object):
    """
    BaseField

    Common base class of all field types. It stores the shared options
    (strict, default, validators, required, allow_null) and implements the
    generic validation, dict (de)serialisation and mock-data entry points
    that sub-classes specialise.
    """

    # INTERNAL_TYPE is the python type used internally for the field's value,
    # like str, int, list, dict. It may also be a sequence of types, such as
    # (int, float). It is used to validate a value's type with
    # isinstance(value, INTERNAL_TYPE).
    INTERNAL_TYPE = object

    FIELD_TYPE_NAME = 'object'

    PARAMS = [
        'strict', 'default', 'validators', 'required'
    ]

    def __init__(self, strict=True, default=EMPTY_VALUE, validators=None, required=True, allow_null=False, **kwargs):
        """
        :param strict: bool, if strict is True, value must be an instance of INTERNAL_TYPE,
            otherwise, value should be convert to INTERNAL_TYPE
        :param default: default value, defaults to EMPTY_VALUE
        :param validators: a validator list (a single validator is wrapped into a list);
            each validator is called with the validated value
        :param required: bool, indicate that this field is whether required
        :param allow_null: bool, when False `validate` rejects None
        :param kwargs: accepted and ignored
        """
        self.strict = strict
        self.default = default
        self.allow_null = allow_null

        if validators is None:
            validators = []
        elif not isinstance(validators, (tuple, list)):
            validators = [validators]
        self.validators = validators

        self.required = required

    def __str__(self):
        return self.__class__.__name__

    @classmethod
    def _check_value_range(cls, min_value, max_value):
        """
        Raise ValueError when both bounds are given and max_value < min_value.
        """
        # An open bound (None) can never conflict with the other one, and
        # comparing a number with None raises TypeError on Python 3.
        if min_value is None or max_value is None:
            return
        if max_value < min_value:
            raise ValueError(_('the max value must greater than or equals the min value, got min value={min}, max value={max}').format(
                min=min_value, max=max_value))

    def _convert_type(self, value):
        """
        Convert value to INTERNAL_TYPE.

        When INTERNAL_TYPE is a sequence, each type is tried in order and the
        first successful conversion wins. Raises ValueError/TypeError when the
        value cannot be converted.
        """
        if not isinstance(self.INTERNAL_TYPE, (tuple, list)):
            return self.INTERNAL_TYPE(value)

        for candidate in self.INTERNAL_TYPE:
            try:
                return candidate(value)
            except (TypeError, ValueError):
                # e.g. int('1.5') raises ValueError; keep trying (float).
                continue
        raise ValueError(
            _('cannot convert {0!r} to {1}').format(value, self.FIELD_TYPE_NAME))

    @classmethod
    def _get_all_params(cls):
        """
        Collect all PARAMS from this class and its parent class.
        """
        params = list(cls.PARAMS)
        bases = cls.__bases__
        for base in bases:
            if issubclass(base, BaseField):
                params.extend(base._get_all_params())
        return params

    def validate(self, value):
        """
        return validated value or raise FieldValidationError.

        A field that is not required returns the value unchanged without
        running any check. None is rejected unless allow_null is set.
        """
        if not self.required:
            return value
        if not self.allow_null and value is None:
            raise exceptions.FieldValidationError(_("value can't be 'null'."))

        value = self._validate(value)
        for v in self.validators:
            v(value)
        return value

    def _validate(self, value):
        """
        return validated value or raise FieldValidationError.
        sub-class should override this method.
        """
        return self._validate_type(value)

    def _validate_type(self, value):
        """
        validate the type of value

        In strict mode a value of the wrong type is rejected; otherwise it is
        converted with `_convert_type` and conversion failures are reported as
        FieldValidationError.
        """
        if not isinstance(value, self.INTERNAL_TYPE):
            if self.strict:
                raise exceptions.FieldValidationError(
                    _('got a wrong type: {0}, expect {1}').format(type(value).__name__, self.FIELD_TYPE_NAME))
            else:
                try:
                    value = self._convert_type(value)
                except (ValueError, TypeError) as e:
                    raise exceptions.FieldValidationError(
                        _('type convertion({0} -> {1}) is failed: {2}').format(type(value).__name__, self.FIELD_TYPE_NAME, str(e)))
        return value

    def is_required(self):
        return self.required

    def get_default(self):
        """
        return default value (a callable default is called to produce it)
        """
        if callable(self.default):
            return self.default()
        else:
            return self.default

    def to_presentation(self, value):
        """
        value: must be a internal value
        """
        return value

    def to_internal(self, value):
        """
        value: must be a validated value
        """
        return value

    def to_dict(self):
        """
        to dict presentation
        """
        d = {
            'type': self.FIELD_TYPE_NAME,
        }
        params = self._get_all_params()
        for name in params:
            if hasattr(self, name):
                value = getattr(self, name)
                # handle special values: EMPTY_VALUE is written as a marker string
                if value is EMPTY_VALUE:
                    value = '__empty__'
                d[name] = value
        return d

    @classmethod
    def from_dict(cls, params):
        """
        Create a field from params.
        sub-class can override this method.
        """
        # reverse of the marker substitution done in to_dict
        if params.get('default') == '__empty__':
            params['default'] = EMPTY_VALUE
        return cls(**params)

    def mock_data(self):
        """
        return mocking data
        sub-class should override this method
        """
        return 'this field doesnt implement mock_data method'
|
||||
|
||||
|
||||
class StringField(BaseField):
    """
    StringField
    internal: six.string_types
    presentation: string

    Supports a length range and an optional regular expression that the
    value must match (using `match`, i.e. anchored at the start).
    """
    if six.PY2:
        INTERNAL_TYPE = (unicode, str)
    else:
        INTERNAL_TYPE = str
    FIELD_TYPE_NAME = 'string'
    PARAMS = ['min_length', 'max_length', 'regex']

    def __init__(self, min_length=0, max_length=None, regex=None, **kwargs):
        """
        :param min_length: minimal length, negative values are treated as 0
        :param max_length: maximal length, None means unlimited
        :param regex: pattern string or compiled pattern, None disables the check
        """
        if min_length < 0:
            min_length = 0
        self._check_value_range(min_length, max_length)
        self.min_length = min_length
        self.max_length = max_length

        if isinstance(regex, six.string_types):
            regex = re.compile(regex)
        self.regex = regex

        super(StringField, self).__init__(**kwargs)

    def _validate(self, value):
        """
        Check type, length range and regex; return the (possibly converted) value.
        """
        value = self._validate_type(value)

        if len(value) < self.min_length:
            raise exceptions.FieldValidationError(
                _('string is too short, min-length is {}').format(self.min_length))
        # `is not None` so that max_length=0 is enforced instead of being
        # mistaken for "no limit".
        if self.max_length is not None and len(value) > self.max_length:
            raise exceptions.FieldValidationError(
                _('string is too long, max-length is {}').format(self.max_length))

        if not self._match(value):
            raise exceptions.FieldValidationError(
                _('{0} not match {1}').format(self.regex.pattern, value))

        return value

    def _match(self, value):
        """Return True when no regex is configured or the regex matches."""
        if self.regex is None:
            return True
        else:
            return self.regex.match(value) is not None

    def to_internal(self, value):
        """Return the value as text; None is passed through."""
        if value is None:
            return value
        return six.text_type(value)

    def mock_data(self):
        """Return a random alphanumeric string within the length range."""
        min_ = self.min_length
        max_ = self.max_length
        if max_ is None:
            max_ = min_ + 100
        size = random.randint(min_, max_)
        random_str = ''.join(
            [random.choice(string.ascii_letters + string.digits) for _ in range(size)])
        random_str = self.to_internal(random_str)
        return random_str
|
||||
|
||||
|
||||
class NumberField(BaseField):
    """
    NumberField
    internal: int or float (and long on Python 2), with an optional value range
    """
    if six.PY2:
        INTERNAL_TYPE = (int, long, float)
    else:
        INTERNAL_TYPE = (int, float)
    FIELD_TYPE_NAME = 'number'
    PARAMS = ['min_value', 'max_value']

    def __init__(self, min_value=None, max_value=None, **kwargs):
        """
        :param min_value: inclusive lower bound, None means unbounded
        :param max_value: inclusive upper bound, None means unbounded
        """
        self._check_value_range(min_value, max_value)
        self.min_value = min_value
        self.max_value = max_value

        super(NumberField, self).__init__(**kwargs)

    def _validate(self, value):
        """Check type and range; return the (possibly converted) value."""
        value = self._validate_type(value)

        if self.min_value is not None and value < self.min_value:
            raise exceptions.FieldValidationError(
                _('value is too small, min-value is {}').format(self.min_value))

        if self.max_value is not None and value > self.max_value:
            raise exceptions.FieldValidationError(
                _('value is too big, max-value is {}').format(self.max_value))

        return value

    def mock_data(self):
        """Return a random float inside the configured range."""
        max_ = self.max_value
        min_ = self.min_value
        if min_ is None:
            # Default lower bound is 0, unless the upper bound is negative:
            # then 0 would lie above max_value and produce invalid data.
            if max_ is None or max_ >= 0:
                min_ = 0
            else:
                min_ = max_ - 1000
        if max_ is None:
            max_ = min_ + 1000
        return random.uniform(min_, max_)
|
||||
|
||||
|
||||
class IntegerField(NumberField):
    """Number field whose internal type is ``int``."""
    FIELD_TYPE_NAME = 'integer'
    INTERNAL_TYPE = int
    PARAMS = []

    def mock_data(self):
        """Return the parent's random number converted with ``int()``."""
        return int(super(IntegerField, self).mock_data())
|
||||
|
||||
|
||||
class FloatField(NumberField):
    """Number field whose internal type is ``float``."""
    FIELD_TYPE_NAME = 'float'
    INTERNAL_TYPE = float
    PARAMS = []
|
||||
|
||||
|
||||
class BoolField(BaseField):
    """Field whose internal type is ``bool``."""
    FIELD_TYPE_NAME = 'bool'
    INTERNAL_TYPE = bool
    PARAMS = []

    def mock_data(self):
        """Return True or False at random."""
        candidates = [True, False]
        return random.choice(candidates)
|
||||
|
||||
|
||||
class UUIDField(BaseField):
    """
    Field holding a ``uuid.UUID``; non-strict by default so that other
    values are converted through ``uuid.UUID(value)``.
    """
    FIELD_TYPE_NAME = 'UUID'
    INTERNAL_TYPE = uuid.UUID
    PARAMS = ['format']
    # format name -> attribute (or method) of uuid.UUID used for presentation
    SUPPORT_FORMATS = {
        'hex': 'hex',
        'str': '__str__',
        'int': 'int',
        'bytes': 'bytes',
        'bytes_le': 'bytes_le'
    }

    def __init__(self, format='hex', **kwargs):
        """
        format: what format used when to_presentation, supports 'hex', 'str', 'int', 'bytes', 'bytes_le'
        """
        supported = format in self.SUPPORT_FORMATS
        if not supported:
            raise ValueError(_('not supports format: {}').format(format))
        self.format = format

        # caller may still force strict mode explicitly
        kwargs.setdefault('strict', False)
        super(UUIDField, self).__init__(**kwargs)

    def _validate(self, value):
        """Only the type is checked (or converted when not strict)."""
        return self._validate_type(value)

    def to_presentation(self, value):
        """Render the UUID in the configured format."""
        assert isinstance(value, self.INTERNAL_TYPE)
        member = getattr(value, self.SUPPORT_FORMATS[self.format])
        # '__str__' resolves to a bound method, the others to plain attributes
        return member() if callable(member) else member

    def mock_data(self):
        """Return a random UUID."""
        return uuid.uuid4()
|
||||
|
||||
|
||||
class MD5Field(StringField):
    """String field accepting exactly 32 hexadecimal characters."""
    REGEX = r'[\da-fA-F]{32}'
    FIELD_TYPE_NAME = 'md5'
    PARAMS = []

    def __init__(self, **kwargs):
        # md5 values are always validated strictly, whatever the caller passed
        kwargs['strict'] = True
        super(MD5Field, self).__init__(
            min_length=32, max_length=32, regex=self.REGEX, **kwargs)

    def _validate(self, value):
        """Run the string checks, replacing any failure by one md5 message."""
        try:
            validated = super(MD5Field, self)._validate(value)
        except exceptions.FieldValidationError:
            raise exceptions.FieldValidationError(
                _('Got wrong md5 value: {}').format(value))
        return validated

    def mock_data(self):
        """Return 32 random hexadecimal characters."""
        chars = [random.choice(string.hexdigits) for _unused in range(32)]
        return ''.join(chars)
|
||||
|
||||
|
||||
class SHAField(StringField):
    """String field accepting the hexadecimal digest of a SHA hash."""
    SUPPORT_VERSION = [1, 224, 256, 384, 512]
    FIELD_TYPE_NAME = 'sha'
    PARAMS = ['version']

    def __init__(self, version=256, **kwargs):
        """
        :param version: one of SUPPORT_VERSION; fixes the expected digest length
        """
        if version not in self.SUPPORT_VERSION:
            raise ValueError(_('{0} not support, support versions are: {1}').format(
                version, self.SUPPORT_VERSION))

        # sha1 is 160 bits -> 40 hex chars; the others are named by bit size,
        # and every 8 bits take 2 hex characters.
        length = 40 if version == 1 else int(version / 8 * 2)
        self.version = version
        self.length = length

        kwargs['strict'] = True
        pattern = r'[\da-fA-F]{' + str(length) + '}'
        super(SHAField, self).__init__(
            min_length=length, max_length=length, regex=pattern, **kwargs)

    def _validate(self, value):
        """Run the string checks, replacing any failure by one sha message."""
        try:
            validated = super(SHAField, self)._validate(value)
        except exceptions.FieldValidationError:
            raise exceptions.FieldValidationError(
                _('Got wrong sha{0} value: {1}').format(self.version, value))
        return validated

    def mock_data(self):
        """Return `self.length` random hexadecimal characters."""
        chars = [random.choice(string.hexdigits) for _unused in range(self.length)]
        return ''.join(chars)
|
||||
|
||||
|
||||
class EmailField(StringField):
    """String field whose value must match the e-mail pattern REGEX."""
    REGEX = r'^[a-zA-Z0-9.!#$%&\'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$'
    FIELD_TYPE_NAME = 'email'
    PARAMS = []

    def __init__(self, **kwargs):
        # e-mail values are always validated strictly
        kwargs['strict'] = True
        super(EmailField, self).__init__(regex=self.REGEX, **kwargs)

    def _validate(self, value):
        """Run the string checks, replacing any failure by one email message."""
        try:
            validated = super(EmailField, self)._validate(value)
        except exceptions.FieldValidationError:
            raise exceptions.FieldValidationError(
                _('Got wrong email value: {}').format(value))
        return validated

    def mock_data(self):
        """Return a random address of the form xxxxx@yyy.com."""
        local_part = ''.join(random.sample(string.ascii_lowercase, 5))
        host = ''.join(random.sample(string.ascii_lowercase, 3))
        domain = '{0}.com'.format(host)
        return '{0}@{1}'.format(local_part, domain)
|
||||
|
||||
|
||||
class IPAddressField(BaseField):
    """
    Field holding an `IP` object; the accepted address family is selected
    with `version` ('ipv4', 'ipv6' or 'both').
    """
    INTERNAL_TYPE = IP
    FIELD_TYPE_NAME = 'ip_address'
    PARAMS = ['version']
    SUPPORT_VERSIONS = ['ipv4', 'ipv6', 'both']

    def __init__(self, version='both', **kwargs):
        """
        :param version: 'ipv4', 'ipv6' or 'both'
        :raises ValueError: when version is not in SUPPORT_VERSIONS
        """
        if version not in self.SUPPORT_VERSIONS:
            raise ValueError(_('{} version is not supported').format(version))
        self.version = version

        kwargs.setdefault('strict', False)
        super(IPAddressField, self).__init__(**kwargs)

    def _validate(self, value):
        """
        Build an `IP` from value and check its address family.

        :raises FieldValidationError: when the value is not an address or is
            of the wrong family
        """
        try:
            value = IP(value)
        except (ValueError, TypeError) as e:
            # report unsupported input types like any other invalid address
            raise exceptions.FieldValidationError(str(e))
        if self.version == 'ipv4' and value.version() != 4:
            raise exceptions.FieldValidationError(
                _('expected an ipv4 address, got {}').format(value.strNormal()))
        if self.version == 'ipv6' and value.version() != 6:
            raise exceptions.FieldValidationError(
                _('expected an ipv6 address, got {}').format(value.strNormal()))
        return value

    def to_presentation(self, value):
        """Return the address in its normal string form."""
        return value.strNormal()

    def mock_data(self):
        """Return a random address of an allowed family."""
        v = self.version
        if v == 'both':
            v = random.choice(['ipv4', 'ipv6'])

        if v == 'ipv4':
            ip = random.randint(0, MAX_IPV4_ADDRESS)
            return IP(ip)
        else:
            ip = random.randint(0, MAX_IPV6_ADDRESS)
            return IP(ip)
|
||||
|
||||
|
||||
class URLField(StringField):
    """String field accepting http/https URLs that carry a hostname."""
    FIELD_TYPE_NAME = 'url'
    PARAMS = []
    SCHEMAS = ('http', 'https')

    def __init__(self, **kwargs):
        # URLs are always validated strictly
        kwargs['strict'] = True
        super(URLField, self).__init__(min_length=0, **kwargs)

    def _validate(self, value):
        """
        Check the type, then that the URL has a supported scheme and a host.

        :return: the URL as rebuilt by `geturl()`
        :raises FieldValidationError: on wrong type, scheme or missing host
        """
        value = self._validate_type(value)
        url = urlparse.urlparse(value)
        if url.scheme not in self.SCHEMAS:
            raise exceptions.FieldValidationError(_('schema is lost'))
        # hostname is None (not '') when the URL has no network location
        if not url.hostname:
            raise exceptions.FieldValidationError(_('hostname is lost'))
        return url.geturl()

    def mock_data(self):
        """Return a fixed example URL."""
        return 'http://www.example.com/media/image/demo.jpg'
|
||||
|
||||
|
||||
class EnumField(BaseField):
    """Field whose value must be one of a fixed collection of choices."""
    FIELD_TYPE_NAME = 'enum'
    INTERNAL_TYPE = object
    PARAMS = ['choices']

    def __init__(self, choices=None, **kwargs):
        """
        :param choices: non-empty collection of allowed values
        :raises ValueError: when choices is None or empty
        """
        missing = choices is None or len(choices) == 0
        if missing:
            raise ValueError('choices cant be empty or None')
        self.choices = choices

        super(EnumField, self).__init__(**kwargs)

    def _validate(self, value):
        """Return value when it is one of the choices, otherwise raise."""
        if value in self.choices:
            return value
        raise exceptions.FieldValidationError(
            _('{!r} not in the choices').format(value))

    def mock_data(self):
        """Return one of the choices at random."""
        return random.choice(self.choices)
|
||||
|
||||
|
||||
class DictField(BaseField):
    """Field holding a dict, optionally checked by a nested validator."""
    FIELD_TYPE_NAME = 'dict'
    INTERNAL_TYPE = dict
    PARAMS = ['validator']

    def __init__(self, validator=None, **kwargs):
        """
        :param validator: Validator object
        """
        self.validator = validator
        super(DictField, self).__init__(**kwargs)

    def _validate(self, value):
        """
        Check the type, then either run the nested validator on the dict or,
        when there is none, return a deep copy of it.
        """
        value = self._validate_type(value)

        if not self.validator:
            return copy.deepcopy(value)

        nested = self.validator(value)
        if not nested.is_valid():
            raise exceptions.FieldValidationError(nested.errors)
        return nested.validated_data

    def to_dict(self):
        """Like the base implementation, with the nested validator expanded."""
        info = super(DictField, self).to_dict()
        nested = info['validator']
        if nested is not None:
            info['validator'] = nested.to_dict()
        return info

    def mock_data(self):
        """Return the nested validator's mock data, or an empty dict."""
        if not self.validator:
            return {}
        return self.validator.mock_data()
|
||||
|
||||
|
||||
class ListField(BaseField):
    """
    Field holding a list or tuple, with an optional length range and an
    optional field used to validate every element.
    """
    INTERNAL_TYPE = (list, tuple)
    FIELD_TYPE_NAME = 'list'
    PARAMS = ['field', 'min_length', 'max_length']

    def __init__(self, field=None, min_length=0, max_length=None, **kwargs):
        """
        :param field: BaseField instance applied to each element, or None
        :param min_length: minimal number of elements
        :param max_length: maximal number of elements, None means unlimited
        :raises ValueError: when field is not a BaseField or the range is invalid
        """
        if field is not None and not isinstance(field, BaseField):
            raise ValueError(
                _('field param expect a instance of BaseField, but got {!r}').format(field))
        self.field = field

        self._check_value_range(min_length, max_length)
        self.min_length = min_length
        self.max_length = max_length

        super(ListField, self).__init__(**kwargs)

    def _validate(self, value):
        """
        Check type and length, then validate each element with `self.field`
        (a new list is returned) or deep-copy the value when no field is set.
        """
        value = self._validate_type(value)
        if self.min_length is not None and len(value) < self.min_length:
            raise exceptions.FieldValidationError(
                _('this list has too few elements, min length is {}').format(self.min_length))

        if self.max_length is not None and len(value) > self.max_length:
            raise exceptions.FieldValidationError(
                _('this list has too many elements, max length is {}').format(self.max_length))

        if self.field:
            new_value = []
            for item in value:
                new_item = self.field.validate(item)
                new_value.append(new_item)
            value = new_value
        else:
            value = copy.deepcopy(value)
        return value

    def to_dict(self):
        """Like the base implementation, with the element field expanded."""
        d = super(ListField, self).to_dict()
        if d['field'] is not None:
            d['field'] = d['field'].to_dict()
        return d

    @classmethod
    def from_dict(cls, params):
        """Create the field, building the element field from its dict form."""
        if 'field' in params and isinstance(params['field'], dict):
            params['field'] = create_field(params['field'])
        return super(ListField, cls).from_dict(params)

    def mock_data(self):
        """
        Return a list of random length within [min_length, max_length]
        (at most min_length + 10 elements when no maximum is set), filled
        with the element field's mock data, or with None when no field is set.
        """
        min_ = self.min_length
        if min_ is None:
            min_ = 0
        min_ = max(min_, 0)
        max_ = self.max_length
        if max_ is None:
            max_ = min_ + 10
        max_ = max(max_, min_)
        # randint is inclusive on both ends, so min_ == max_ is fine and a
        # list of exactly max_length elements can be produced.
        length = random.randint(min_, max_)

        data = [None] * length
        if self.field:
            for i in range(length):
                data[i] = self.field.mock_data()
        return data
|
||||
|
||||
|
||||
class TimestampField(IntegerField):
    """Integer field limited to the range 0 .. 2**32 - 1."""
    FIELD_TYPE_NAME = 'timestamp'
    PARAMS = []

    def __init__(self, **kwargs):
        upper_bound = 2 ** 32 - 1
        super(TimestampField, self).__init__(
            min_value=0, max_value=upper_bound, **kwargs)

    def _validate(self, value):
        """Run the integer checks, replacing any failure by one timestamp message."""
        try:
            validated = super(TimestampField, self)._validate(value)
        except exceptions.FieldValidationError:
            raise exceptions.FieldValidationError(
                _('Got wrong timestamp: {}').format(value))
        return validated
|
||||
|
||||
|
||||
class DatetimeField(BaseField):
    """
    Field holding a `datetime.datetime`. When not strict (the default),
    integers and digit strings are read as timestamps and other strings are
    parsed with `dt_format`.
    """
    INTERNAL_TYPE = datetime.datetime
    FIELD_TYPE_NAME = 'datetime'
    PARAMS = ['dt_format', 'tzinfo']
    DEFAULT_FORMAT = '%Y/%m/%d %H:%M:%S'

    def __init__(self, dt_format=None, tzinfo=None, **kwargs):
        """
        :param dt_format: strptime/strftime format, defaults to DEFAULT_FORMAT
        :param tzinfo: tzinfo object, or a timezone name resolved with pytz
        :raises ValueError: when tzinfo is a name and pytz is not installed
        """
        if dt_format is None:
            dt_format = self.DEFAULT_FORMAT
        self.dt_format = dt_format
        if isinstance(tzinfo, six.string_types):
            # pytz is only needed (and imported) for timezone names
            try:
                import pytz
            except ImportError as e:
                raise ValueError(
                    _('Cant create DatetimeField instance with tzinfo {}, please install pytz and try again').format(tzinfo))
            tzinfo = pytz.timezone(tzinfo)
        self.tzinfo = tzinfo
        kwargs.setdefault('strict', False)
        super(DatetimeField, self).__init__(**kwargs)

    def _convert_type(self, value):
        """
        Convert a timestamp (int or digit string) or a formatted string to a
        datetime; raise ValueError for any other type.
        """
        # override
        if isinstance(value, six.string_types):
            if value.isdigit():
                value = int(value)
                return self.INTERNAL_TYPE.fromtimestamp(value, tz=self.tzinfo)
            else:
                dt = self.INTERNAL_TYPE.strptime(value, self.dt_format)
                if self.tzinfo:
                    dt = dt.replace(tzinfo=self.tzinfo)
                return dt
        elif isinstance(value, six.integer_types):
            return self.INTERNAL_TYPE.fromtimestamp(value, tz=self.tzinfo)
        else:
            raise ValueError(_('Got wrong datetime value: {}').format(value))

    def _validate(self, value):
        """Check (or convert) the type and return a copy of the datetime."""
        value = self._validate_type(value)
        return copy.copy(value)

    def to_presentation(self, value):
        """Format the datetime with `dt_format`."""
        return value.strftime(self.dt_format)

    def to_dict(self):
        """Like the base implementation, with tzinfo rendered as text."""
        d = super(DatetimeField, self).to_dict()
        if d['tzinfo'] is not None:
            d['tzinfo'] = force_text(d['tzinfo'])
        return d

    def mock_data(self):
        """Return the datetime of a random timestamp in 0 .. 2**32 - 1."""
        return self.INTERNAL_TYPE.fromtimestamp(random.randint(0, 2 ** 32 - 1))
|
||||
|
||||
|
||||
class DateField(BaseField):
    """
    Field holding a `datetime.date`. When not strict (the default),
    integers and digit strings are read as timestamps and other strings are
    parsed with `dt_format`.
    """
    DEFAULT_FORMAT = '%Y/%m/%d'
    FIELD_TYPE_NAME = 'date'
    INTERNAL_TYPE = datetime.date
    PARAMS = ['dt_format']

    def __init__(self, dt_format=None, **kwargs):
        """
        :param dt_format: strptime/strftime format, defaults to DEFAULT_FORMAT
        """
        self.dt_format = self.DEFAULT_FORMAT if dt_format is None else dt_format
        kwargs.setdefault('strict', False)
        super(DateField, self).__init__(**kwargs)

    def _convert_type(self, value):
        """
        Convert a timestamp (int or digit string) or a formatted string to a
        date; raise ValueError for any other type.
        """
        # override
        if isinstance(value, six.string_types):
            if not value.isdigit():
                parsed = datetime.datetime.strptime(value, self.dt_format)
                return parsed.date()
            # digit strings are handled as timestamps below
            value = int(value)
        elif not isinstance(value, six.integer_types):
            raise ValueError()
        return self.INTERNAL_TYPE.fromtimestamp(value)

    def _validate(self, value):
        """Check (or convert) the type and return a copy of the date."""
        checked = self._validate_type(value)
        return copy.copy(checked)

    def to_presentation(self, value):
        """Format the date with `dt_format`."""
        return value.strftime(self.dt_format)

    def mock_data(self):
        """Return the date of a random timestamp in 0 .. 2**32 - 1."""
        stamp = random.randint(0, 2 ** 32 - 1)
        return self.INTERNAL_TYPE.fromtimestamp(stamp)
|
36
spider/validator/translation.py
Normal file
36
spider/validator/translation.py
Normal file
@ -0,0 +1,36 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
|
||||
# Directory containing this module; the bundled catalogues live in ./locale.
BASE_PATH = os.path.abspath(os.path.dirname(__file__))
DEFAULT_LOCALE_DIR = os.path.join(BASE_PATH, 'locale')
DOMAIN = 'python-validator'

# Environment variables that override the catalogue directory and the
# comma-separated list of languages.
ENV_LOCALE_DIR = 'PYTHON_VALIDATOR_LOCALE'
ENV_LANGUAGES = 'PYTHON_VALIDATOR_LANGUAGES'

domain = DOMAIN
localedir = os.environ.get(ENV_LOCALE_DIR, DEFAULT_LOCALE_DIR)
languages = os.environ.get(ENV_LANGUAGES)
try:
    languages = languages.split(',') if languages is not None else None
except Exception:
    languages = None


import gettext as _gettext
# fallback=True: when no catalogue is found, messages are returned untranslated
translation = _gettext.translation(
    domain, localedir=localedir, languages=languages, fallback=True)
|
||||
|
||||
def get_localedir():
    """Return the directory the translation catalogues are loaded from."""
    current = localedir
    return current
|
||||
|
||||
def gettext(s):
    """Return the translation of `s` from the module's translation object."""
    translated = translation.gettext(s)
    return translated
|
||||
|
||||
def ngettext(singular, plural, n):
    """Return the plural-aware translation for count `n`."""
    translated = translation.ngettext(singular, plural, n)
    return translated
|
||||
|
||||
def lgettext(s):
    """
    Return the translation of `s` via the translation object's `lgettext`.

    `lgettext` was removed from the gettext translation classes in
    Python 3.11; when it is not available the plain `gettext` result is
    returned instead of raising AttributeError.
    """
    legacy = getattr(translation, 'lgettext', None)
    if legacy is None:
        return translation.gettext(s)
    return legacy(s)
|
||||
|
||||
def lngettext(singular, plural, n):
    """
    Return the plural-aware translation via the translation object's
    `lngettext`.

    `lngettext` was removed from the gettext translation classes in
    Python 3.11; when it is not available the plain `ngettext` result is
    returned instead of raising AttributeError.
    """
    legacy = getattr(translation, 'lngettext', None)
    if legacy is None:
        return translation.ngettext(singular, plural, n)
    return legacy(singular, plural, n)
|
46
spider/validator/utils.py
Normal file
46
spider/validator/utils.py
Normal file
@ -0,0 +1,46 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
import six
|
||||
|
||||
def force_text(s, encoding='utf8', errors='strict'):
    """
    Return `s` as a text (unicode) string.

    Text is returned unchanged, byte strings are decoded with `encoding` and
    `errors`, and any other object is converted with `six.text_type`.
    """
    if isinstance(s, six.text_type):
        return s

    if issubclass(type(s), six.string_types):
        # Note: We use .decode() here, instead of six.text_type(s, encoding,
        # errors), so that if s is a SafeBytes, it ends up being a
        # SafeText at the end.
        return s.decode(encoding, errors)

    if six.PY3:
        if isinstance(s, bytes):
            return six.text_type(s, encoding, errors)
        return six.text_type(s)

    # Python 2, non-string object
    if hasattr(s, '__unicode__'):
        return six.text_type(s)
    return six.text_type(bytes(s), encoding, errors)
|
||||
|
||||
def force_bytes(s, encoding='utf-8', errors='strict'):
    """
    Return `s` as a byte string.

    Byte strings are treated as utf-8 and re-encoded only when another
    `encoding` is requested; text is encoded; any other object is first
    converted to text (or with `bytes()` on Python 2).
    """
    if isinstance(s, six.binary_type):
        if encoding != 'utf-8':
            return s.decode('utf-8', errors).encode(encoding, errors)
        return s

    if isinstance(s, six.string_types):
        return s.encode(encoding, errors)

    try:
        if six.PY3:
            return six.text_type(s).encode(encoding)
        return bytes(s)
    except UnicodeEncodeError:
        # retry with the caller's error handler
        return six.text_type(s).encode(encoding, errors)
|
||||
|
||||
# force_str yields the native `str` type of the running interpreter:
# text on Python 3, bytes on Python 2.
force_str = force_text if six.PY3 else force_bytes
force_unicode = force_text
|
158
spider/validator/validator.py
Normal file
158
spider/validator/validator.py
Normal file
@ -0,0 +1,158 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
import six
|
||||
from . import exceptions
|
||||
from .fields import BaseField, EMPTY_VALUE, create_field, DictField
|
||||
from .utils import force_str
|
||||
|
||||
class ValidatorMetaClass(type):
    """
    Metaclass that gathers the BaseField attributes of a validator class
    into `_FIELDS_MAP` (fields of the bases first, overridden by the fields
    declared on the class itself) and removes them from the class namespace.
    """

    def __new__(cls, cls_name, bases, attrs):
        inherited = dict()
        for parent in bases:
            if hasattr(parent, '_FIELDS_MAP'):
                inherited.update(parent._FIELDS_MAP)

        declared = dict()
        for attr_name, attr_value in six.iteritems(attrs):
            if isinstance(attr_value, BaseField):
                declared[attr_name] = attr_value

        # fields are only reachable through _FIELDS_MAP, not as attributes
        for attr_name in declared:
            attrs.pop(attr_name, None)

        inherited.update(declared)
        attrs['_FIELDS_MAP'] = inherited

        return super(ValidatorMetaClass, cls).__new__(cls, cls_name, bases, attrs)
|
||||
|
||||
|
||||
@six.add_metaclass(ValidatorMetaClass)
class Validator(object):
    """ a data validator like Django ORM
    """

    def __init__(self, raw_data):
        """
        :param raw_data: unvalidate data
        """
        assert isinstance(raw_data, dict), '"raw_data" must be a dict, not "{}"'.format(type(raw_data).__name__)
        self.raw_data = raw_data
        self.validated_data = None
        self.errors = {}

    def _validate(self):
        """
        Validate every field, then the data as a whole; fill `self.errors`
        on failure and `self.validated_data` on success.
        """
        collected = {}
        for field_name, field in six.iteritems(self._FIELDS_MAP):
            raw = self.raw_data.get(field_name, field.get_default())
            if raw is EMPTY_VALUE:
                # neither a value nor a default: an error only if required
                if field.is_required():
                    self.errors[field_name] = exceptions.FieldRequiredError()
                continue

            # None is not special-cased here; it goes through field.validate.
            try:
                checked = field.validate(raw)
                internal = field.to_internal(checked)
                # optional per-field hook: validate_<field name>(value)
                hook = getattr(
                    self, 'validate_{}'.format(field_name), None)
                if hook and callable(hook):
                    hook(internal)
                collected[field_name] = internal
            except exceptions.FieldValidationError as err:
                self.errors[field_name] = err

        if self.errors:
            return

        try:
            collected = self.validate(collected)
        except exceptions.ValidationError as err:
            self.errors['__data_error__'] = err

        if not self.errors:
            self.validated_data = collected

    def is_valid(self, raise_error=False):
        """
        Run the validation and return True when no error was recorded.

        :param raise_error: raise ValidationError instead of returning False
        """
        self._validate()
        if raise_error and self.errors:
            raise exceptions.ValidationError(self.errors)
        if self.errors:
            return False
        return True

    def validate(self, data):
        """
        model-level validate.
        sub-class can override this method to validate data, return modified data
        """
        return data

    @property
    def str_errors(self):
        """Map each name in `errors` to the detail of its error."""
        return dict(
            (name, error.get_detail())
            for name, error in six.iteritems(self.errors))

    @classmethod
    def to_dict(cls):
        """
        format Validator to dict
        """
        return dict(
            (name, field.to_dict())
            for name, field in six.iteritems(cls._FIELDS_MAP))

    @classmethod
    def mock_data(cls):
        """
        return random mocking data.
        mocking data will be valid in most case, but it maybe can't pass from your own `validate` method or `validator`
        """
        return dict(
            (name, field.mock_data())
            for name, field in six.iteritems(cls._FIELDS_MAP))

    def _format(self):
        """Return '<ClassName: name:type,...>' with a bounded field list."""
        described = ','.join(
            '{0}:{1}'.format(name, field.FIELD_TYPE_NAME)
            for name, field in six.iteritems(self._FIELDS_MAP))
        if len(described) > 103:
            described = described[:100]
        return '<{0}: {1}>'.format(self.__class__.__name__, described)

    def __str__(self):
        return self._format()

    def __repr__(self):
        return self._format()
|
||||
|
||||
|
||||
def create_validator(data_struct_dict, name=None):
    """
    create a Validator class from data_struct_dict

    :param data_struct_dict: a dict describe validator's fields, like the dict `to_dict()` method returned.
        It is not modified.
    :param name: name of Validator class, defaults to 'FromDictValidator'

    :return: a new Validator sub-class
    """

    if name is None:
        name = 'FromDictValidator'
    attrs = {}
    for field_name, field_info in six.iteritems(data_struct_dict):
        # Work on a shallow copy so that replacing the nested validator
        # description below does not change the caller's dict.
        field_info = dict(field_info)
        field_type = field_info['type']
        if field_type == DictField.FIELD_TYPE_NAME and isinstance(field_info.get('validator'), dict):
            field_info['validator'] = create_validator(field_info['validator'])
        attrs[field_name] = create_field(field_info)
    name = force_str(name)
    return type(name, (Validator, ), attrs)
|
14
spider/wordpropertyconversion.py
Normal file
14
spider/wordpropertyconversion.py
Normal file
@ -0,0 +1,14 @@
|
||||
def word_property_conversion(text):
    """
    Map a fine-grained part-of-speech tag to its general tag.

    Tags without a mapping are returned unchanged.
    """
    # (specific tags, general tag), checked in this order
    groups = (
        (("vi.", "vt.", "aux."), "v."),
        (("det.",), "pron."),
        (("int.",), "interj."),
        (("pref.", "symb."), "abbr."),
    )
    for specific_tags, general_tag in groups:
        if text in specific_tags:
            return general_tag
    return text
|
56
spider/wordspider.py
Normal file
56
spider/wordspider.py
Normal file
@ -0,0 +1,56 @@
|
||||
import logging
|
||||
from fake_useragent import UserAgent
|
||||
import httpx
|
||||
from parsel import Selector
|
||||
from htmlmin import minify
|
||||
|
||||
|
||||
def get_content(text: str) -> str:
    """
    Return the markup selected by ``.results-content`` in `text`, after
    removing the examples, web translations, scripts, styles, links and
    image lists inside it.
    """
    content = Selector(text).css(".results-content")
    unwanted = ("#examples", "#webTrans", "script", "style", "a", ".img-list")
    for css_query in unwanted:
        content.css(css_query).remove()
    return content.get()
|
||||
|
||||
|
||||
class WordSpider:
    """
    Download the youdao dictionary page of one word and extract its
    result section.

    `success` is None before the first request, then True or False
    according to the outcome of the last request.
    """

    def __init__(self, word: str) -> None:
        self.useragent = UserAgent()
        self.headers = {"User-Agent": self.useragent.random}
        self.data = []
        self.word = word
        self.__html = ''
        self.url = "http://www.youdao.com/w/eng/{}/".format(word)
        self.success = None

    def get_html(self) -> str:
        """
        Fetch the page and return its minified HTML.

        On a transport error or an error status the failure is logged,
        `success` is set to False and an empty string is returned.
        """
        try:
            response = httpx.get(self.url, headers=self.headers)
            response.raise_for_status()
        except httpx.HTTPError as e:
            # HTTPError covers both request failures and error statuses
            logging.error("fail. %s", e)
            self.success = False
            self.__html = ''
            return self.__html
        self.__html = minify(response.text, True, True)
        self.success = True
        return self.__html

    @property
    def html(self) -> str:
        """The page HTML, fetched on access while nothing is stored yet."""
        if self.__html == "":
            self.get_html()
        return self.__html

    def parse_page(self):
        """
        Return the extracted result section, or None when no page could be
        obtained.
        """
        page = self.html
        if not page:
            return None
        return get_content(page)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # manual smoke test: fetch and print the entry for the word "a"
    spider = WordSpider("a")
    extracted = spider.parse_page()
    print(extracted)
|
Reference in New Issue
Block a user