55 lines
1.3 KiB
Python
55 lines
1.3 KiB
Python
from typing import Sequence
|
|
import time
|
|
import logging
|
|
|
|
from sqlalchemy.orm import Session
|
|
from sqlalchemy.sql.expression import update
|
|
|
|
from wordspider import WordSpider
|
|
from models import WordData
|
|
import models
|
|
|
|
|
|
AMOUNT = 9500
|
|
|
|
|
|
def get_not_retrieve_word_list(session: Session) -> Sequence:
    """Build the query for WordData rows whose page has not been retrieved.

    Args:
        session: SQLAlchemy session the query is bound to.

    Returns:
        The query object produced by ``session.query(WordData)`` filtered
        on ``has_retrieve=False``.  Nothing is materialized here; the script
        entry point in this file calls ``.count()`` on the result and
        slices it.
    """
    all_words = session.query(WordData)
    return all_words.filter_by(has_retrieve=False)
|
|
|
|
|
|
def get_data(session: Session, word_list: Sequence) -> None:
    """Fetch the page for each word and store it on the matching row.

    For every word a ``WordSpider`` is created and ``parse_page()`` is
    called.  A one-character progress marker is printed without a newline:
    ``*`` when ``spider.success`` is truthy, ``F`` otherwise.  Failed words
    are skipped; successful ones have ``html`` set to the parsed text and
    ``has_retrieve`` set to ``True``, and the change is committed before
    moving on to the next word.

    Args:
        session: SQLAlchemy session used to run and commit the updates.
        word_list: Words to process, compared against ``WordData.word``.
    """
    for word in word_list:
        crawler = WordSpider(word)
        page_html = crawler.parse_page()

        # Failure path first: mark it and leave the row untouched.
        if not crawler.success:
            print("F", end="")
            continue
        print("*", end="")

        statement = update(WordData).where(WordData.word == word)
        statement = statement.values(html=page_html, has_retrieve=True)
        session.execute(statement)
        session.commit()
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: process up to AMOUNT not-yet-retrieved words and
    # print the elapsed wall-clock time in seconds.
    starttime = time.time()
    logging.basicConfig(filename="spider.log")
    with models.Session() as session:
        pending = get_not_retrieve_word_list(session)

        # LIMIT caps the batch at AMOUNT rows and simply yields fewer when
        # fewer are pending, so a separate COUNT query is unnecessary.
        # The words are materialized into a list before get_data runs so the
        # SELECT is fully consumed before UPDATEs and commits are issued on
        # the same session, and so get_data receives a real sequence rather
        # than a lazy map iterator.
        word_list = [item.word for item in pending.limit(AMOUNT)]

        get_data(session, word_list)

    endtime = time.time()
    print(endtime - starttime)
|