"""Fetch the page of every word that has not been retrieved yet.

Words whose ``has_retrieve`` flag is false are read from the database, each
one is fetched through ``WordSpider``, and the resulting text is written back
to the ``html`` column together with ``has_retrieve=True``.
"""
from typing import Iterable, Sequence
import logging
import time

from sqlalchemy.orm import Query, Session
from sqlalchemy.sql.expression import update

import models
from models import WordData
from wordspider import WordSpider

# Upper bound on the number of words processed in a single run.
AMOUNT = 9500


def get_not_retrieve_word_list(session: Session) -> Query:
    """Return the query selecting ``WordData`` rows not retrieved yet.

    The query is returned unevaluated, so callers may refine it further
    (for example with ``limit``) before iterating over it.
    """
    return session.query(WordData).filter_by(has_retrieve=False)


def get_data(session: Session, word_list: Iterable) -> None:
    """Fetch every word in *word_list* and store the page that comes back.

    A ``*`` is printed for each word fetched successfully and an ``F`` for
    each failure. Failed words are left untouched in the database, so they
    are still selected by ``get_not_retrieve_word_list`` afterwards.
    """
    for word in word_list:
        spider = WordSpider(word)
        text = spider.parse_page()
        if not spider.success:
            # The marks carry no newline, so flush to make progress visible.
            print("F", end="", flush=True)
            continue
        print("*", end="", flush=True)
        session.execute(
            update(WordData)
            .where(WordData.word == word)
            .values(html=text, has_retrieve=True)
        )
        # Commit per word so an interrupted run keeps the pages fetched so far.
        session.commit()


def main() -> None:
    """Process up to ``AMOUNT`` pending words and print the elapsed seconds."""
    started = time.perf_counter()
    logging.basicConfig(filename="spider.log")
    with models.Session() as session:
        # LIMIT already copes with fewer than AMOUNT rows; no COUNT is needed.
        pending = get_not_retrieve_word_list(session).limit(AMOUNT)
        # Read the words up front: get_data commits on this same session, and
        # touching ORM objects lazily between commits could re-query each one.
        word_list: Sequence = [item.word for item in pending]
        get_data(session, word_list)
    print(time.perf_counter() - started)


if __name__ == "__main__":
    main()