Use the following pipeline:
from scrapy.exceptions import DropItem

class DuplicatesPipeline(object):

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['id'])
            return item
Source: https://docs.scrapy.org/en/latest/topics/item-pipeline.html#duplicates-filter
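
For the pipeline to actually run, it also has to be registered in the project's settings.py. A minimal sketch, assuming the class above lives in a hypothetical module named myproject.pipelines (substitute your project's real package name):

# settings.py
# Enable the duplicates filter; the key is the import path to the class,
# the value is its order (lower numbers run earlier, range 0-1000).
ITEM_PIPELINES = {
    "myproject.pipelines.DuplicatesPipeline": 300,
}

Note that ids_seen is a plain in-memory set, so deduplication only holds within a single crawl run; items seen in a previous run will not be filtered.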