私は、スクラップしたデータをmysqlデータベースに保存するためのパイプラインを作成しています。クモがターミナルで走るとき、それは完全に働く。パイプラインさえ開いています。ただし、データはデータベースに送信されていません。どんな助けにも感謝! :)治療パイプラインがデータをMySQLに挿入しない
は、ここでパイプラインコードです:
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
import datetime
import logging
import MySQLdb
import MySQLdb.cursors
from scrapy.exceptions import DropItem
from en_movie.items import EnMovie
class DuplicatesPipeline(object):
def __init__(self):
self.ids_seen = set()
def process_item(self, item, spider):
if item['image_urls'] in self.ids_seen:
raise DropItem("Duplicate item found: %s" % item)
else:
self.ids_seen.add(item['image_urls'])
return item
class MyImagesPipeline(ImagesPipeline):
def get_media_requests(self, item, info):
for image_url in item['image_urls']:
yield scrapy.Request(image_url)
def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
item['image_paths'] = ', '.join(image_paths)
return item
class EnMovieStorePipeline(object):
def __init__(self):
self.conn = MySQLdb.connect(host="localhost", user="root", passwd="pass", db="passdb", charset="utf8", use_unicode=True)
self.cursor = self.conn.cursor()
def process_item(self, item, spider):
cursor.execute("""SELECT * FROM dmmactress_enmovielist WHERE Content_ID = %s and release_date = %s and running_time = %s and Actress = %s and Series = %s and Studio = %s and Director = %s and Label = %s and image_paths = %s and image_urls = %s""",
(item['Content_ID'][0].encode('utf-8'), item['release_date'][0].encode('utf-8'), item['running_time'][0].encode('utf-8'), item['Actress'][0].encode('utf-8'), item['Series'][0].encode('utf-8'), item['Studio'][0].encode('utf-8'), item['Director'][0].encode('utf-8'), item['Label'][0].encode('utf-8'), item['image_urls'][0].encode('utf-8')))
result = self.cursor.fetchone()
if result:
print("data already exist")
else:
try:
cursor.execute("""INSERT INTO dmmactress_enmovielist(Content_ID, release_date, running_time, Actress, Series, Studio, Director, Label, image_paths, image_urls) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
(item['Content_ID'][0].encode('utf-8'), item['release_date'][0].encode('utf-8'), item['running_time'][0].encode('utf-8'), item['Actress'][0].encode('utf-8'), item['Series'][0].encode('utf-8'), item['Studio'][0].encode('utf-8'), item['Director'][0].encode('utf-8'), item['Label'][0].encode('utf-8'), item['image_urls'][0].encode('utf-8')))
self.conn.commit()
except MySQLdb.Error as e:
print ("Error %d: %s" % (e.args[0], e.args[1]))
return item
EDIT:
def parse_item(self, response):
for sel in response.xpath('//*[@id="contents"]/div[10]/section/section[1]/section[1]'):
item = EnMovie()
Content_ID = sel.xpath('normalize-space(div[2]/dl/dt[contains (.,"Content ID:")]/following-sibling::dd[1]/text())').extract()
item['Content_ID'] = Content_ID[0].encode('utf-8')
release_date = sel.xpath('normalize-space(div[2]/dl[1]/dt[contains (.,"Release Date:")]/following-sibling::dd[1]/text())').extract()
item['release_date'] = release_date[0].encode('utf-8')
running_time = sel.xpath('normalize-space(div[2]/dl[1]/dt[contains (.,"Runtime:")]/following-sibling::dd[1]/text())').extract()
item['running_time'] = running_time[0].encode('utf-8')
Series = sel.xpath('normalize-space(div[2]/dl[2]/dt[contains (.,"Series:")]/following-sibling::dd[1]/text())').extract()
item['Series'] = Series[0].encode('utf-8')
Studio = sel.xpath('normalize-space(div[2]/dl[2]/dt[contains (.,"Studio:")]/following-sibling::dd[1]/a/text())').extract()
item['Studio'] = Studio[0].encode('utf-8')
Director = sel.xpath('normalize-space(div[2]/dl[2]/dt[contains (.,"Director:")]/following-sibling::dd[1]/text())').extract()
item['Director'] = Director[0].encode('utf-8')
Label = sel.xpath('normalize-space(div[2]/dl[2]/dt[contains (.,"Label:")]/following-sibling::dd[1]/text())').extract()
item['Label'] = Label[0].encode('utf-8')
item['image_urls'] = sel.xpath('div[1]/img/@src').extract()
actresses = sel.xpath("//*[@itemprop='actors']//*[@itemprop='name']/text()").extract()
actress = [x.strip() for x in actresses]
item['Actress'] = ", ".join(actress)
yield item
...あなたは上記のコードを使用している場合だけでなく、あなたのsettings.pyファイルを更新することを忘れないでください!あなたの 'item'の様子を見せてもらえますか? – Harrison
こんにちは。ハリソン私は何の誤りもなかった。だから、私は本当に自分のパイプラインに何が問題なのか分からない。 (私はあなたの返事のために自分の投稿を変更しました)ありがとうございます。 – Jin
ほとんどの 'item'フィールドは' image_urls'を除いて文字列です。 'item ['image_urls']'以外のitem ['Content_ID'] '、' item ['release_date'] 'などの後に' [0] 'を削除してみてください。 – Harrison