2017-05-20 2 views



# Hand the path straight to pandas so it opens and closes the file itself;
# wrapping it in a bare open() left the file handle unclosed.
data = pd.read_csv('myfile.csv', sep=';')

# Split the label column off from the remaining columns in one step.
target = data.pop("label")

# Lowercase the sentence column before tokenizing.
data.sentence = data.sentence.str.lower()

# POS-tag each sentence.
# NOTE: Line_new is rebound on every pass, so only the tags of the last
# sentence are still available once the loop ends.
for line in data.sentence:
    Line_new = nltk.pos_tag(nltk.word_tokenize(line))


[('together', 'RB'), ('with', 'IN'), ('the', 'DT'), ('6th', 'CD'), ('battalion', 'NN'), ('of', 'IN'), ('the', 'DT')] 


UPDATE: 所望の出力がある

I was there  1 1  1  0  0 
He came there  0 0  1  1  1 


"A child who is exclusively or predominantly oral (using speech for communication) can experience social isolation from his or her hearing peers, particularly if no one takes the time to explicitly teach them social skills that other children acquire independently by virtue of having normal hearing.";"certain" 
"Preliminary Discourse to the Encyclopedia of Diderot";"certain" 
"d'Alembert claims that it would be ignorant to perceive that everything could be known about a particular subject.";"certain" 
"However, as the overemphasis on parental influence of psychodynamics theory has been strongly criticized in the previous century, modern psychologists adopted interracial contact as a more important determinant than childhood experience on shaping people’s prejudice traits (Stephan & Rosenfield, 1978).";"uncertain" 
"this can also be summarized as a distinguish behaviour on the peronnel level";"uncertain" 

あなたはマトリックスとはどういう意味ですか?希望する出力の例を指定してもらえますか? – alvas


@alvasありがとうございました!最後に望ましい出力を追加しました – ZverArt


あなたの `myfile.csv` はどのような内容ですか? `data.head()` の出力を見せてもらえますか? – alvas




>>> import pandas as pd 
>>> df = pd.read_csv('myfile.csv', delimiter=';') 
>>> df.columns = ['sent', 'tag'] 
>>> df['sent'] 
0 Preliminary Discourse to the Encyclopedia of D... 
1 d'Alembert claims that it would be ignorant to... 
2 However, as the overemphasis on parental influ... 
3 this can also be summarized as a distinguish b... 
Name: sent, dtype: object 
>>> df['tag'] 
0  certain 
1  certain 
2 uncertain 
3 uncertain 


>>> from collections import Counter 
>>> from itertools import chain 
>>> from nltk import word_tokenize, pos_tag 
>>> from functools import partial 
>>> tok_and_tag = lambda x: pos_tag(word_tokenize(x)) 
>>> df['sent'][0] 
'Preliminary Discourse to the Encyclopedia of Diderot' 
>>> tok_and_tag(df['sent'][0]) 
[('Preliminary', 'JJ'), ('Discourse', 'NNP'), ('to', 'TO'), ('the', 'DT'), ('Encyclopedia', 'NNP'), ('of', 'IN'), ('Diderot', 'NNP')] 


>>> df['sent'].apply(tok_and_tag) 
0 [(Preliminary, JJ), (Discourse, NNP), (to, TO)... 
1 [(d'Alembert, NN), (claims, NNS), (that, IN), ... 
2 [(However, RB), (,, ,), (as, IN), (the, DT), (... 
3 [(this, DT), (can, MD), (also, RB), (be, VB), ... 
Name: sent, dtype: object 


>>> df['sent'].apply(str.lower) 
0 preliminary discourse to the encyclopedia of d... 
1 d'alembert claims that it would be ignorant to... 
2 however, as the overemphasis on parental influ... 
3 this can also be summarized as a distinguish b... 
Name: sent, dtype: object 

>>> df['lower_sent'] = df['sent'].apply(str.lower) 

>>> df['lower_sent'].apply(tok_and_tag) 
0 [(preliminary, JJ), (discourse, NN), (to, TO),... 
1 [(d'alembert, NN), (claims, NNS), (that, IN), ... 
2 [(however, RB), (,, ,), (as, IN), (the, DT), (... 
3 [(this, DT), (can, MD), (also, RB), (be, VB), ... 
Name: lower_sent, dtype: object 


>>> df['lower_sent'] 
0 preliminary discourse to the encyclopedia of d... 
1 d'alembert claims that it would be ignorant to... 
2 however, as the overemphasis on parental influ... 
3 this can also be summarized as a distinguish b... 
Name: lower_sent, dtype: object 

>>> df['lower_sent'].apply(tok_and_tag) 
0 [(preliminary, JJ), (discourse, NN), (to, TO),... 
1 [(d'alembert, NN), (claims, NNS), (that, IN), ... 
2 [(however, RB), (,, ,), (as, IN), (the, DT), (... 
3 [(this, DT), (can, MD), (also, RB), (be, VB), ... 
Name: lower_sent, dtype: object 

>>> df['tagged_sent'] = df['lower_sent'].apply(tok_and_tag) 

>>> tokens, tags = zip(*chain(*df['tagged_sent'].tolist())) 

>>> tags 
('JJ', 'NN', 'TO', 'DT', 'NN', 'IN', 'NN', 'NN', 'NNS', 'IN', 'PRP', 'MD', 'VB', 'JJ', 'TO', 'VB', 'IN', 'NN', 'MD', 'VB', 'VBN', 'IN', 'DT', 'JJ', 'NN', '.', 'RB', ',', 'IN', 'DT', 'NN', 'IN', 'JJ', 'NN', 'IN', 'NNS', 'NN', 'VBZ', 'VBN', 'RB', 'VBN', 'IN', 'DT', 'JJ', 'NN', ',', 'JJ', 'NNS', 'VBD', 'JJ', 'NN', 'IN', 'DT', 'RBR', 'JJ', 'NN', 'IN', 'NN', 'NN', 'IN', 'VBG', 'JJ', 'NN', 'NNS', '(', 'NN', 'CC', 'NN', ',', 'CD', ')', '.', 'DT', 'MD', 'RB', 'VB', 'VBN', 'IN', 'DT', 'JJ', 'NN', 'IN', 'DT', 'NNS', 'NN') 

>>> set(tags) 
{'CC', 'VB', ')', 'NNS', ',', 'JJ', 'VBZ', 'DT', 'NN', 'PRP', 'RBR', 'TO', 'VBD', '(', 'VBN', '.', 'MD', 'IN', 'RB', 'VBG', 'CD'} 
>>> possible_tags = sorted(set(tags)) 
>>> possible_tags 
['(', ')', ',', '.', 'CC', 'CD', 'DT', 'IN', 'JJ', 'MD', 'NN', 'NNS', 'PRP', 'RB', 'RBR', 'TO', 'VB', 'VBD', 'VBG', 'VBN', 'VBZ'] 

>>> possible_tags_counter = Counter({p:0 for p in possible_tags}) 
>>> possible_tags_counter 
Counter({'NNS': 0, 'VBZ': 0, 'DT': 0, '(': 0, 'JJ': 0, 'VBD': 0, ')': 0, 'RB': 0, 'VBG': 0, 'RBR': 0, 'VB': 0, 'IN': 0, 'CC': 0, ',': 0, 'PRP': 0, 'CD': 0, 'VBN': 0, '.': 0, 'MD': 0, 'NN': 0, 'TO': 0}) 


>>> df['tagged_sent'].apply(lambda x: Counter(list(zip(*x))[1])) 
0 {'NN': 3, 'IN': 1, 'TO': 1, 'DT': 1, 'JJ': 1} 
1 {'NN': 3, 'VB': 3, 'PRP': 1, 'TO': 1, 'DT': 1,... 
2 {')': 1, 'JJ': 6, 'NN': 11, 'CC': 1, 'NNS': 3,... 
3 {'DT': 3, 'VB': 1, 'NN': 2, 'VBN': 1, 'NNS': 1... 
Name: tagged_sent, dtype: object 

>>> df['pos_counts'] = df['tagged_sent'].apply(lambda x: Counter(list(zip(*x))[1])) 

>>> df['pos_counts'] 
0 {'NN': 3, 'IN': 1, 'TO': 1, 'DT': 1, 'JJ': 1} 
1 {'NN': 3, 'VB': 3, 'PRP': 1, 'TO': 1, 'DT': 1,... 
2 {')': 1, 'JJ': 6, 'NN': 11, 'CC': 1, 'NNS': 3,... 
3 {'DT': 3, 'VB': 1, 'NN': 2, 'VBN': 1, 'NNS': 1... 
Name: pos_counts, dtype: object 

# Now we can add the POS tags that don't appear in the sentence, with counts of 0: 

>>> def add_pos_with_zero_counts(counter, keys_to_add): 
...  for k in keys_to_add: 
...   counter[k] = counter.get(k, 0) 
...  return counter 
>>> df['pos_counts'].apply(lambda x: add_pos_with_zero_counts(x, possible_tags)) 
0 {'VB': 0, 'IN': 1, 'PRP': 0, 'DT': 1, 'CC': 0,... 
1 {'VB': 3, ')': 0, 'DT': 1, 'CC': 0, 'RB': 0, '... 
2 {'VB': 0, ')': 1, 'JJ': 6, 'NN': 11, 'CC': 1, ... 
3 {'VB': 1, 'IN': 2, 'PRP': 0, 'NN': 2, 'CC': 0,... 
Name: pos_counts, dtype: object 

>>> df['pos_counts_with_zero'] = df['pos_counts'].apply(lambda x: add_pos_with_zero_counts(x, possible_tags)) 


>>> df['pos_counts_with_zero'].apply(lambda x: [count for tag, count in sorted(x.most_common())]) 
0 [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 3, 0, 0, 0, 0, ... 
1 [0, 0, 0, 1, 0, 0, 1, 3, 2, 2, 3, 1, 1, 0, 0, ... 
2 [1, 1, 3, 1, 1, 1, 3, 7, 6, 0, 11, 3, 0, 2, 1,... 
3 [0, 0, 0, 0, 0, 0, 3, 2, 1, 1, 2, 1, 0, 1, 0, ... 
Name: pos_counts_with_zero, dtype: object 

>>> df['sent_vector'] = df['pos_counts_with_zero'].apply(lambda x: [count for tag, count in sorted(x.most_common())]) 


>>> df2 
    () , . CC CD DT IN JJ MD ... NNS PRP RB RBR TO VB VBD \ 
0 0 0 0 0 0 0 1 1 1 0 ...  0 0 0 0 1 0 0 
1 0 0 0 1 0 0 1 3 2 2 ...  1 1 0 0 1 3 0 
2 1 1 3 1 1 1 3 7 6 0 ...  3 0 2 1 0 0 1 
3 0 0 0 0 0 0 3 2 1 1 ...  1 0 1 0 0 1 0 

0 0 0 0 
1 0 1 0 
2 1 2 1 
3 0 1 0 

[4 rows x 21 columns] 

>>> df2 = pd.DataFrame(df['sent_vector'].tolist()) 
>>> df2.columns = sorted(possible_tags) 


from collections import Counter 
from itertools import chain 

import pandas as pd 

from nltk import word_tokenize, pos_tag 

# Load the ';'-separated file and name its two columns.
# NOTE(review): read_csv treats the first line as a header by default --
# confirm that myfile.csv really starts with a header row.
df = pd.read_csv('myfile.csv', delimiter=';')
df.columns = ['sent', 'tag']


def tok_and_tag(x):
    """Tokenize the string *x* and return its list of (token, POS tag) pairs."""
    return pos_tag(word_tokenize(x))


df['lower_sent'] = df['sent'].apply(str.lower)
df['tagged_sent'] = df['lower_sent'].apply(tok_and_tag)

# Sorted inventory of every POS tag seen in any sentence. The set
# comprehension also copes with an empty corpus, where indexing the result of
# zip(*...) would raise IndexError.
possible_tags = sorted({tag for sent in df['tagged_sent'] for _, tag in sent})

def add_pos_with_zero_counts(counter, keys_to_add):
    """Make sure every key in *keys_to_add* is present in *counter*.

    Keys that already exist keep their current count; keys that are absent
    are inserted with a count of 0. The mapping is modified in place and the
    very same object is returned.
    """
    for wanted_key in keys_to_add:
        counter.setdefault(wanted_key, 0)
    return counter

# Detailed steps.
# 1. Count the POS tags of each tagged sentence. Feeding Counter a generator
#    (rather than indexing zip(*x)) also works for a sentence with no tokens.
df['pos_counts'] = df['tagged_sent'].apply(lambda x: Counter(tag for _, tag in x))
# 2. Pad each counter with a 0 entry for every tag it lacks. The helper
#    mutates its argument, so 'pos_counts' ends up holding the same padded
#    objects as 'pos_counts_with_zero'.
df['pos_counts_with_zero'] = df['pos_counts'].apply(lambda x: add_pos_with_zero_counts(x, possible_tags))
# 3. Sort the (tag, count) pairs by tag and keep only the counts, so position
#    i of every vector corresponds to possible_tags[i].
df['sent_vector'] = df['pos_counts_with_zero'].apply(lambda x: [count for tag, count in sorted(x.most_common())])

# All in one: the same three steps chained into a single expression.
df['sent_vector'] = df['tagged_sent'].apply(
    lambda x: [
        count
        for tag, count in sorted(
            add_pos_with_zero_counts(
                Counter(tag for _, tag in x), possible_tags
            ).most_common()
        )
    ]
)

# One row per sentence, one column per POS tag.
df2 = pd.DataFrame(df['sent_vector'].tolist())
df2.columns = possible_tags

ソリューションをありがとう、より重要なのは、良い説明です! – ZverArt


回答がうまくいってうれしい; P – alvas


こんにちは!最後にもう1つ質問です - ラベル値(certain/uncertain)を含むタグ列を出力に追加しようとしましたが、適切な方法が見つかりませんでした。何かお勧めはありますか? – ZverArt
