osdict_project/spider/action_getwebdata.py

55 lines
1.3 KiB
Python

from typing import Sequence
import time
import logging
from sqlalchemy.orm import Session
from sqlalchemy.sql.expression import update
from wordspider import WordSpider
from models import WordData
import models
# Upper bound on how many not-yet-retrieved words a single run of this
# script processes (applied in the __main__ section).
AMOUNT = 9500
def get_not_retrieve_word_list(session: Session) -> Sequence:
    """Build the query for words whose page has not been retrieved yet.

    Args:
        session: Session the query is bound to.

    Returns:
        The query over ``WordData`` filtered to ``has_retrieve=False``.
        It is returned as built by ``filter_by``; this function does not
        iterate it, so callers can still refine it (the script entry
        point calls ``count()`` on it and slices it).
    """
    all_words = session.query(WordData)
    return all_words.filter_by(has_retrieve=False)
def get_data(session: Session, word_list: Sequence) -> None:
    """Fetch the page for every word and store it on its WordData row.

    For each word a ``WordSpider`` is created and ``parse_page()`` is
    called. When the spider reports success, ``*`` is printed and the row
    whose ``word`` column equals the word is updated with the returned
    text and ``has_retrieve=True``. When it reports failure, ``F`` is
    printed, the word is logged, and the row is left untouched so a later
    run can pick it up again.

    Args:
        session: Session used to execute and commit the updates.
        word_list: Iterable of the words to fetch.
    """
    logger = logging.getLogger(__name__)
    for word in word_list:
        spider = WordSpider(word)
        text = spider.parse_page()

        if not spider.success:
            # Progress markers have no trailing newline, so flush them
            # explicitly; otherwise they may stay in the stdout buffer.
            print("F", end="", flush=True)
            # The bare "F" does not say which word failed; record it.
            logger.warning("failed to retrieve page for word %r", word)
            continue

        print("*", end="", flush=True)
        session.execute(
            update(WordData)
            .where(WordData.word == word)
            .values(html=text, has_retrieve=True)
        )
        # Commit per word so pages already fetched are persisted even if
        # a later word makes the run stop early.
        session.commit()
if __name__ == "__main__":
    # Script entry point: fetch pages for at most AMOUNT words that have
    # not been retrieved yet, then print the elapsed time in seconds.
    starttime = time.time()
    logging.basicConfig(filename="spider.log")
    with models.Session() as session:
        pending = get_not_retrieve_word_list(session)
        # LIMIT simply yields fewer rows when fewer than AMOUNT are
        # pending, so no separate COUNT query is needed to decide whether
        # to cap the result.
        # The words are copied into a plain list before any update runs:
        # get_data() commits on this same session, and a lazy map() over
        # the ORM instances would read ``item.word`` only after those
        # commits (which, if the session expires objects on commit, costs
        # one reload query per word).
        word_list = [item.word for item in pending.limit(AMOUNT)]
        get_data(session, word_list)
    endtime = time.time()
    print(endtime - starttime)