import logging

from models import Session, WordData, Word
from serializers import WordAddSerializer, MeaningAddSerializer
from renderercontents import renderer_word, renderer_meaningslist, has_value_to_render


def create_word_meaning(text, session) -> None:
    """Persist the word found in *text* together with its meanings.

    The word dictionary is produced by ``renderer_word`` and validated
    through ``WordAddSerializer``; the meanings come from
    ``renderer_meaningslist`` and are each validated through
    ``MeaningAddSerializer``.  Nothing is committed here -- the caller owns
    the transaction on *session*.

    Args:
        text: Raw page content handed to the renderer helpers (the caller
            in this file passes ``WordData.html``).
        session: Database session used for the duplicate check and passed
            to both serializers.

    Returns:
        None.  The function returns early without saving anything when the
        word fails validation or a ``Word`` with the same spelling already
        exists.

    Raises:
        Exception: Whatever ``renderer_meaningslist`` raises is logged and
            re-raised unchanged.
    """
    word_dict = renderer_word(text)
    word_serializer = WordAddSerializer(word_dict, session)

    # NOTE(review): the positional ``True`` is presumably a
    # "raise on invalid" switch -- confirm against the serializer.
    try:
        word_is_valid = word_serializer.is_valid(True)
    except Exception as e:
        logging.error("msg: {} data: {}".format(e, word_dict))
        return  # fail validation
    if not word_is_valid:
        return  # fail validation

    already_stored = session.query(Word).filter_by(spelling=word_dict['spelling']).count() > 0
    if already_stored:
        return  # repeat

    word = word_serializer.save()
    if word.spelling is None:
        # A saved word without a spelling is only reported; processing of
        # its meanings continues as before.
        logging.error("word spelling is null word_dict: {}\n{}".format(word_dict, text))

    try:
        meaning_list = renderer_meaningslist(text)
    except Exception as e:
        logging.error("msg: {} word: {}\n{}".format(e, word.spelling, text))
        raise  # bare raise keeps the original traceback intact

    for meaning in meaning_list:
        meaning_serializer = MeaningAddSerializer(meaning, session, word)
        try:
            meaning_is_valid = meaning_serializer.is_valid(True)
        except Exception as e:
            logging.error("msg: {} word: {} data: {}".format(e, word.spelling, meaning))
            # Skip this meaning: saving a serializer that failed validation
            # would persist unvalidated data (or fail inside save()).
            continue
        if not meaning_is_valid:
            continue
        meaning_serializer.save()


def main() -> None:
    """Render every retrieved ``WordData`` row into words and meanings.

    Logging goes to ``spider.log``.  All rows with ``has_retrieve=True`` are
    processed inside one session, and the session is committed once after
    the last row.
    """
    logging.basicConfig(filename="spider.log")
    with Session() as session:
        queryset = session.query(WordData).filter_by(has_retrieve=True)
        for word_data in queryset:
            text = word_data.html
            if has_value_to_render(text):
                create_word_meaning(text, session)
        session.commit()


if __name__ == "__main__":
    main()