Indian 2011 Census(2011年インド国勢調査)からデータを抽出する必要があります。Seleniumを使っており、以下のような動作するスクリプトはあるのですが、joblibライブラリのParallelを使ってタスクを並列化しようとしています。このスクリプトを実行してもエラーは発生せず、タスクマネージャー(Windows 10)ではプロセッサが稼働しているのが確認できます。しかし、並列版の実行が完了しても、保存されるはずのファイルが見当たりません。どんな助言でも大歓迎です。よろしくお願いします。ちなみに、このプログラムの入力データセットへのリンクはこちらです。joblibを使用してSeleniumのスクレイピングタスクを並列化するにはどうすればよいですか? (動作しない例)
スワップを使わずにこれを実行できるだけの十分なメモリがあると仮定すると、ドキュメントを確認してみる必要があります。
import time
import re
import string
import urllib.parse
import pandas
import numpy
import os
import csv
import joblib
from selenium import webdriver
from joblib import Parallel, delayed
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
# Project working directory; the relative workbook path below is resolved
# against it after the chdir.
path = 'C:/Users/d.wm.mclaughlin/Dropbox/research/india'
os.chdir(path)

# Input table read from worksheet "Sheet1"; downloadFunction looks up its
# columns by row label.
input_df = pandas.read_excel(
    "file_path/villages_3109_UTTAR PRADESH_12_003.xlsx",
    "Sheet1",
)
def _read_count(driver, column):
    """Return the integer shown in cell ``column`` of the counts row.

    The cell is the ``column``-th ``<td>`` of the third row of the table
    addressed by the selector below. An empty cell is treated as 0.
    """
    selector = ('#form1 > div:nth-child(4) > center:nth-child(2) > table > tbody'
                ' > tr:nth-child(3) > td:nth-child(' + str(column) + ')')
    text = driver.find_element_by_css_selector(selector).text
    return 0 if text == '' else int(text)


def _scrape_cells(driver, table_count):
    """Return the flattened ``<td>`` texts of the result tables under ``#Div1``.

    Tables are addressed as ``nth-child(1)``, ``nth-child(4)``, ... for
    ``table_count`` steps. Every collected table after the first has its
    leading 18 cells dropped before flattening.
    """
    per_table = []
    for child_index in range(1, 1 + table_count * 3, 3):
        table_address = '#Div1 > table:nth-child(' + str(child_index) + ')'
        for table in driver.find_elements_by_css_selector(table_address):
            tds = table.find_elements_by_tag_name('td')
            if tds:
                per_table.append([td.text for td in tds])
    # NOTE(review): the 18 dropped cells are presumably a header block that
    # repeats on each table -- confirm against the page.
    trimmed = per_table[:1] + [cells[18:] for cells in per_table[1:]]
    return [text for cells in trimmed for text in cells]


def downloadFunction(x):
    """Scrape the records for row ``x`` of ``input_df`` and save them to Excel.

    Opens a PhantomJS session, selects the state / district / block / pan
    values taken from ``input_df`` in the page's drop-downs, clicks the first
    radio button and ``#Button1``, and, unless the page reports
    'No Record Found !!!', reshapes the scraped table cells into a 9-column
    DataFrame that is written to an .xlsx file named after the selection.

    Args:
        x: Row label used to index the columns of the module-level ``input_df``.

    Returns:
        None. ``x`` is printed once the row has been processed.
    """
    columns = ['no', 'hh_name', 'gender', 'age', 'sc', 'fm_name',
               'depriv_count', 'ai_d_code', 'total_mem']
    driver = webdriver.PhantomJS('C:/phantomjs/bin/phantomjs.exe')
    try:
        driver.get("url")
        selected_state = str(input_df['state_no'][x])
        selected_district = str(input_df['dist_no'][x])
        selected_block = str(input_df['block_no'][x]).zfill(3)
        selected_pan = str(input_df['pan'][x]).zfill(4)

        # The drop-downs are filled in this fixed order: state, district,
        # block, pan.
        for dropdown, value in (('#ddl_state', selected_state),
                                ('#ddl_dist', selected_district),
                                ('#ddl_blk', selected_block),
                                ('#ddl_pan', selected_pan)):
            Select(driver.find_element_by_css_selector(dropdown)).select_by_value(value)

        button_list = ['#RadioButtonList1_0', '#RadioButtonList1_1', '#RadioButtonList1_2']
        button_names = ['auto_inclusion', 'auto_exclusion', 'other']
        # Only the first radio button is processed (range(0, 1)).
        for b in range(0, 1):
            selected_button_name = button_names[b]
            driver.find_element_by_css_selector(button_list[b]).click()
            driver.find_element_by_css_selector('#Button1').click()
            if 'No Record Found !!!' in driver.page_source:
                print('No Record Found !!!')
                continue

            ae = _read_count(driver, 1)
            ai = _read_count(driver, 2)
            oth = _read_count(driver, 3)
            dep = _read_count(driver, 4)
            records = [ai + dep, ae, oth]
            # NOTE(review): round() can under-count when the last table is
            # less than half full (e.g. 50 records -> 1); math.ceil may be
            # what is intended -- confirm against the page layout.
            table_number = round(records[b] / 45)

            flat_data = _scrape_cells(driver, table_number)
            newArray = numpy.array(flat_data)
            dataRows = int(newArray.size / 9)
            test = pandas.DataFrame(newArray.reshape(dataRows, 9), columns=columns)
            file_path = (
                'C:/Users/d.wm.mclaughlin/Dropbox/research/lpg_india/data/secc/secc_'
                + '__'.join([selected_state, selected_district, selected_block,
                             selected_pan, selected_button_name])
                + '.xlsx'
            )
            test.to_excel(file_path, 'Sheet1')
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # PhantomJS process running.
        driver.quit()
    print(x)
    return None
# Guard the dispatch so that worker processes importing this module do not
# re-run it; joblib documents this as required on Windows for process-based
# backends.
if __name__ == '__main__':
    # Run downloadFunction for row labels 1 through 9 on three workers.
    tester = Parallel(n_jobs=3)(
        delayed(downloadFunction)(in_val) for in_val in range(1, 10)
    )
どのくらいのメモリを使用していますか?また、お使いのマシンにはどれくらいのメモリが搭載されていますか?