私はtwitterデータベースで感情分析を行うためにコードをオンラインにしました。私はそれを実行しようとしたが、それは印刷のための最初のエラーで私に与えた。これは、新しいバージョンのpythonが印刷を行う方法を変更したことを理解した。誰かがPythonで作業していて、私が間違っている場所を見るためにワシの目を持っていると、配列にデータが埋め込まれていないことを示すエラーが発生しています。感情分析コード(word2vec)が私のpythonバージョン(語彙が組み込まれていない)で正しく動作していません
import numpy as np
from copy import deepcopy
from string import punctuation
from random import shuffle
import chardet
from sklearn.manifold import TSNE
from sklearn.preprocessing import scale
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
import gensim
from gensim.models.word2vec import Word2Vec
LabeledSentence = gensim.models.doc2vec.LabeledSentence
import pandas as pd
pd.options.mode.chained_assignment = None
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
def ingest(filename):
with open(filename, 'rb') as f:
result = chardet.detect(f.read())
data = pd.read_csv(filename, encoding=result['encoding'])
data.drop(['ItemID', 'Date', 'Blank', 'SentimentSource'], axis=1, inplace=True)
data = data[data.Sentiment.isnull() == False]
data['Sentiment'] = data['Sentiment'].map({4:1, 0:0})
data = data[data['SentimentText'].isnull() == False]
data.reset_index(inplace=True)
data.drop('index', axis=1, inplace=True)
print('dataset loaded with shape {}', format(data.shape))
return data
def tokenize(tweet):
try:
tweet = unicode(tweet.decode('utf-8').lower())
tokens = tokenizer.tokenize(tweet)
tokens = filter(lambda t: not t.startswith('@'), tokens)
tokens = filter(lambda t: not t.startswith('#'), tokens)
tokens = filter(lambda t: not t.startswith('http'), tokens)
return tokens
except:
return 'NC'
def postprocess(data, n=100):
data = data.head(n)
data['tokens'] = data['SentimentText'].progress_map(tokenize)
data = data[data.tokens != 'NC']
data.reset_index(inplace=True)
data.drop('index', inplace=True, axis=1)
return data
def labelizeTweets(tweets, label_type):
labelized = []
for i,v in enumerate(tweets):
label = '%s_%s'%(label_type,i)
labelized.append(LabeledSentence(v, [label]))
print(":::::::::::::::::::::::::")
return labelized
def labelizeTweets(tweets, label_type):
labelized = []
for i,v in tqdm(enumerate(tweets)):
label = '%s_%s'%(label_type,i)
labelized.append(LabeledSentence(v, [label]))
return labelized
def buildWordVector(tokens, size):
vec = np.zeros(size).reshape((1, size))
count = 0.
for word in tokens:
try:
vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
count += 1.
except KeyError:
continue
if count != 0:
vec /= count
return vec
if __name__ == '__main__':
filename = './training.csv'
#n = 1000000
n = 100
n_dim = 200
data = ingest(filename)
#data = data.head(5)
data = postprocess(data, n)
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens), np.array(data.head(n).Sentiment), test_size=0.2)
print("training length X", len(x_train))
print("training length Y", len(y_train))
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')
print("jljkjkjlkjlj", len(x_train))
tweet_w2v = Word2Vec(size=n_dim, min_count=10)
#tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
tweet_w2v.build_vocab([x.words for x in x_train])
#tweet_w2v.train([x.words for x in tqdm(x_train)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
print(tweet_w2v.most_similar('good'))
if True:
print('building tf-idf matrix ...')
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print('vocab size :', len(tfidf))
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)
test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
loss='binary_crossentropy',
metrics=['accuracy'])
model.fit(train_vecs_w2v, y_train, epochs=20, batch_size=32, verbose=2)
score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
print (score[1])
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
x_axis_type=None, y_axis_type=None, min_border=1)
word_vectors = [tweet_w2v[w] for w in tweet_w2v.wv.vocab.keys()[:5000]]
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = tweet_w2v.wv.vocab.keys()[:5000]
plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)
これは私が同じコードと同じ問題があった
C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
dataset loaded with shape {} (505, 2)
progress-bar: 100%|##########################################################################| 505/505 [00:00<?, ?it/s]
training length X 0
training length Y 0
0it [00:00, ?it/s]
0it [00:00, ?it/s]
jljkjkjlkjlj 0
Traceback (most recent call last):
File "Sentiment_Analysis.py", line 127, in <module>
tweet_w2v.train([x.words for x in x_train],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
File "C:\Users\lenovo\AppData\Local\Programs\Python\Python35\lib\site-packages\gensim\models\word2vec.py", line 951, in train
raise RuntimeError("you must first build vocabulary before training the model")
RuntimeError: you must first build vocabulary before training the model
これを試すことができますか? :tweet_w2v = Word2Vec(文章= [x_trainのxのx.words、サイズ= n_dim、min_count = 10) –
またはそれ以上 - build_vocabを呼び出した後にtweet_w2v.wv.vocabを印刷/検査できますか?あなたはスタックトレースを持っていますか? –
私は印刷物( "トレーニング長X"、len(x_train))を返して0を返すので、問題はコード内にあると思います。 – user573014