pipelines.py 885 B

1234567891011121314151617181920212223242526272829
  1. # -*- coding: utf-8 -*-
  2. # Define your item pipelines here
  3. #
  4. # Don't forget to add your pipeline to the ITEM_PIPELINES setting
  5. # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
  6. import os
  7. from pymongo import MongoClient
  8. class XueqiuPipeline(object):
  9. mongo = MongoClient(
  10. host=os.environ.get('MONGO_HOST') or 'localhost',
  11. port=int(os.environ.get('MONGO_PORT') or 27017)
  12. )
  13. db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
  14. col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'results_xueqiu')
  15. # create indexes
  16. col.create_index('stocks')
  17. col.create_index('id')
  18. col.create_index('url')
  19. def process_item(self, item, spider):
  20. item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
  21. if self.col.find_one({'id': item['id']}) is None:
  22. self.col.save(item)
  23. return item