
I put together the code from this link which is nicely colour coded, making four minor changes to fix some errors. I also used some code from two earlier forum posts on finding the semantic similarity of sentences within a document.

The code computes the semantic similarity between each pair of consecutive sentences across the whole text and then displays all of the similarity values obtained that way, for example:

"Yellow door.", "Red hammer" 0.65

"Pink fox in the wood.", "The commander fox is blue." 0.32

Here is the code:

import math 
import sys 

import nltk 
import numpy as np 
from nltk.corpus import wordnet as wn 
from nltk.corpus import brown 

ALPHA = 0.2 
BETA = 0.45 
ETA = 0.4 
PHI = 0.2 
DELTA = 0.85 

brown_freqs = dict() 
N = 0 

######################### word similarity ########################## 

def get_best_synset_pair(word_1, word_2): 
    """ 
    Choose the pair with highest path similarity among all pairs. 
    Mimics pattern-seeking behavior of humans. 
    """ 
    max_sim = -1.0 
    synsets_1 = wn.synsets(word_1) 
    synsets_2 = wn.synsets(word_2) 
    if len(synsets_1) == 0 or len(synsets_2) == 0: 
     return None, None 
    else: 
     max_sim = -1.0 
     best_pair = None, None 
     for synset_1 in synsets_1: 
      for synset_2 in synsets_2: 
       sim = wn.path_similarity(synset_1, synset_2) 
       if sim > max_sim: 
        max_sim = sim 
        best_pair = synset_1, synset_2 
     return best_pair 

def length_dist(synset_1, synset_2): 

    l_dist = sys.maxint 
    if synset_1 is None or synset_2 is None: 
     return 0.0 
    if synset_1 == synset_2: 
     # if synset_1 and synset_2 are the same synset return 0 
     l_dist = 0.0 
    else: 
     wset_1 = set([str(x.name()) for x in synset_1.lemmas()])   
     wset_2 = set([str(x.name()) for x in synset_2.lemmas()]) 
     if len(wset_1.intersection(wset_2)) > 0: 
      # if synset_1 != synset_2 but there is word overlap, return 1.0 
      l_dist = 1.0 
     else: 
      # just compute the shortest path between the two 
      l_dist = synset_1.shortest_path_distance(synset_2) 
      if l_dist is None: 
       l_dist = 0.0 
    # normalize path length to the range [0,1] 
    return math.exp(-ALPHA * l_dist) 

def hierarchy_dist(synset_1, synset_2): 

    h_dist = sys.maxint 
    if synset_1 is None or synset_2 is None: 
     return h_dist 
    if synset_1 == synset_2: 
     # return the depth of one of synset_1 or synset_2 
     h_dist = max([x[1] for x in synset_1.hypernym_distances()]) 
    else: 
     # find the max depth of least common subsumer 
     hypernyms_1 = {x[0]:x[1] for x in synset_1.hypernym_distances()} 
     hypernyms_2 = {x[0]:x[1] for x in synset_2.hypernym_distances()} 
     lcs_candidates = set(hypernyms_1.keys()).intersection(
      set(hypernyms_2.keys())) 
     if len(lcs_candidates) > 0: 
      lcs_dists = [] 
      for lcs_candidate in lcs_candidates: 
       lcs_d1 = 0 
       if lcs_candidate in hypernyms_1: 
        lcs_d1 = hypernyms_1[lcs_candidate] 
       lcs_d2 = 0 
       if lcs_candidate in hypernyms_2: 
        lcs_d2 = hypernyms_2[lcs_candidate] 
       lcs_dists.append(max([lcs_d1, lcs_d2])) 
      h_dist = max(lcs_dists) 
     else: 
      h_dist = 0 
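    # The expression below is tanh(BETA * h_dist), which rescales the depth value into the range [0, 1). 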
    return ((math.exp(BETA * h_dist) - math.exp(-BETA * h_dist))/
     (math.exp(BETA * h_dist) + math.exp(-BETA * h_dist))) 

def word_similarity(word_1, word_2): 
    synset_pair = get_best_synset_pair(word_1, word_2) 
    return (length_dist(synset_pair[0], synset_pair[1]) * 
     hierarchy_dist(synset_pair[0], synset_pair[1])) 

######################### sentence similarity ########################## 

def most_similar_word(word, word_set): 

    max_sim = -1.0 
    sim_word = "" 
    for ref_word in word_set: 
     sim = word_similarity(word, ref_word) 
     if sim > max_sim: 
      max_sim = sim 
      sim_word = ref_word 
    return sim_word, max_sim 

def info_content(lookup_word): 

    global N 
    if N == 0: 
     # poor man's lazy evaluation 
     for sent in brown.sents(): 
      for word in sent: 
       word = word.lower() 
       if not word in brown_freqs: 
        brown_freqs[word] = 0 
       brown_freqs[word] = brown_freqs[word] + 1 
       N = N + 1 
    lookup_word = lookup_word.lower() 
    n = 0 if not lookup_word in brown_freqs else brown_freqs[lookup_word] 
    return 1.0 - (math.log(n + 1)/math.log(N + 1)) 

def semantic_vector(words, joint_words, info_content_norm): 

    sent_set = set(words) 
    semvec = np.zeros(len(joint_words)) 
    i = 0 
    for joint_word in joint_words: 
     if joint_word in sent_set: 
      # if word in union exists in the sentence, s(i) = 1 (unnormalized) 
      semvec[i] = 1.0 
      if info_content_norm: 
       semvec[i] = semvec[i] * math.pow(info_content(joint_word), 2) 
     else: 
      # find the most similar word in the joint set and set the sim value 
      sim_word, max_sim = most_similar_word(joint_word, sent_set) 
      semvec[i] = PHI if max_sim > PHI else 0.0 
      if info_content_norm: 
       semvec[i] = semvec[i] * info_content(joint_word) * info_content(sim_word) 
     i = i + 1 
    return semvec     

def semantic_similarity(sentence_1, sentence_2, info_content_norm): 

    words_1 = nltk.word_tokenize(sentence_1) 
    words_2 = nltk.word_tokenize(sentence_2) 
    joint_words = set(words_1).union(set(words_2)) 
    vec_1 = semantic_vector(words_1, joint_words, info_content_norm) 
    vec_2 = semantic_vector(words_2, joint_words, info_content_norm) 
    return np.dot(vec_1, vec_2.T)/(np.linalg.norm(vec_1) * np.linalg.norm(vec_2)) 

######################### word order similarity ########################## 

def word_order_vector(words, joint_words, windex): 

    wovec = np.zeros(len(joint_words)) 
    i = 0 
    wordset = set(words) 
    for joint_word in joint_words: 
     if joint_word in wordset: 
      # word in joint_words found in sentence, just populate the index 
      wovec[i] = windex[joint_word] 
     else: 
      # word not in joint_words, find most similar word and populate 
      # word_vector with the thresholded similarity 
      sim_word, max_sim = most_similar_word(joint_word, wordset) 
      if max_sim > ETA: 
       wovec[i] = windex[sim_word] 
      else: 
       wovec[i] = 0 
     i = i + 1 
    return wovec 

def word_order_similarity(sentence_1, sentence_2): 
    """ 
    Computes the word-order similarity between two sentences as the normalized 
    difference of word order between the two sentences. 
    """ 
    words_1 = nltk.word_tokenize(sentence_1) 
    words_2 = nltk.word_tokenize(sentence_2) 
    joint_words = list(set(words_1).union(set(words_2))) 
    windex = {x[1]: x[0] for x in enumerate(joint_words)} 
    r1 = word_order_vector(words_1, joint_words, windex) 
    r2 = word_order_vector(words_2, joint_words, windex) 
    return 1.0 - (np.linalg.norm(r1 - r2)/np.linalg.norm(r1 + r2)) 

######################### overall similarity ########################## 

def similarity(sentence_1, sentence_2, info_content_norm): 
    """ 
    Calculate the semantic similarity between two sentences. The last 
    parameter is True or False depending on whether information content 
    normalization is desired or not. 
    """ 
    return DELTA * semantic_similarity(sentence_1, sentence_2, info_content_norm) + \ 
     (1.0 - DELTA) * word_order_similarity(sentence_1, sentence_2) 
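For reference, a single pair can be scored directly with the functions above. A minimal check (it assumes the NLTK punkt, wordnet and brown data have already been downloaded):

# One-off check of the pipeline on the example pairs quoted earlier. 
# Requires nltk.download('punkt'), nltk.download('wordnet'), nltk.download('brown') 
print(similarity("Yellow door.", "Red hammer", True)) 
print(similarity("Pink fox in the wood.", "The commander fox is blue.", True)) 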

This is the LOOPING PART of the code. When I run it on a text file I get an error, and instead of the similarity values between the sentences I get nan:

with open("C:\\Users\\Lenovo2\\Desktop\\Test123.txt", "r") as sentence_file: 
    # Initialize a list to hold the results 
    results = [] 

    # Loop until we hit the end of the file 
    while True: 
     # Read two lines 
     x = sentence_file.readline() 
     y = sentence_file.readline() 

     # Check if we've reached the end of the file, if so, we're done 
     if not y: 
      # Break out of the infinite loop 
      break 
     else: 
      # The .rstrip('\n') removes the newline character from each line 
      x = x.rstrip('\n') 
      y = y.rstrip('\n') 

      # Calculate your similarity value 
      similarity_value = similarity(x, y, True) 

      # Add the two lines and similarity value to the results list 
      results.append([x, y, similarity_value]) 

# Loop through the pairs in the results list and print them 
for pair in results: 
    print(pair) 

The error I get is:

Warning (from warnings module): 
    File "C:\Users\Lenovo2\Desktop\Semantic Analysis (1).py", line 191 
    return np.dot(vec_1, vec_2.T)/(np.linalg.norm(vec_1) * np.linalg.norm(vec_2)) 
RuntimeWarning: invalid value encountered in double_scalars 

From a previous forum thread I understood that this error probably means I am dividing by zero because one of the vectors is a zero vector. I am rather stuck, and with my limited Python experience I don't know how to fix the program easily or without changing it too much.
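If the goal is simply to avoid the nan, a minimal defensive sketch (not part of the original code, and assuming a similarity of 0.0 is acceptable for such a pair) is to check the denominator before dividing in semantic_similarity:

def semantic_similarity(sentence_1, sentence_2, info_content_norm): 
    words_1 = nltk.word_tokenize(sentence_1) 
    words_2 = nltk.word_tokenize(sentence_2) 
    joint_words = set(words_1).union(set(words_2)) 
    vec_1 = semantic_vector(words_1, joint_words, info_content_norm) 
    vec_2 = semantic_vector(words_2, joint_words, info_content_norm) 
    denom = np.linalg.norm(vec_1) * np.linalg.norm(vec_2) 
    if denom == 0.0: 
        # One of the sentences produced an all-zero semantic vector 
        # (e.g. an empty or blank line); report zero similarity 
        # instead of dividing by zero. 
        return 0.0 
    return np.dot(vec_1, vec_2.T) / denom 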

Answer

I think you are passing in an empty string. Do you have blank lines in your text? You only strip the newline after checking for an empty string, so a string containing nothing but a newline is not caught.

Since you appear to be on Windows, you may also have '\r\n'-style line endings, in which case rstrip('\n') may not do what you expect (it leaves the carriage return behind).
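A quick illustration of both points, assuming a blank line read from a Windows-style file:

line = "\r\n"                    # a blank line with a Windows line ending 
print(bool(line))                # True -> it passes an "if not y" check before stripping 
print(repr(line.rstrip('\n')))   # '\r' -> rstrip('\n') leaves the carriage return behind 
print(repr(line.rstrip()))       # ''   -> bare rstrip() removes all trailing whitespace 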

I would suggest making the following change (it also adds a print for debugging):

# Loop until we hit the end of the file 
while True: 
    # Read two lines, removing trailing whitespace 
    x = sentence_file.readline().rstrip() 
    y = sentence_file.readline().rstrip() 

    # Check if we've reached the end of the file, if so, we're done 
    if not x or not y: 
     # Break out of the infinite loop 
     break 
    else: 
     print(x, y) 
     # Calculate your similarity value 
     similarity_value = similarity(x, y, True) 

     # Add the two lines and similarity value to the results list 
     results.append([x, y, similarity_value]) 

Also note that the code appears to have a bug, in that you are not comparing every consecutive pair of sentences. That is, with sentences (a, b, c, d) you only compare (a, b) and (c, d), when you really want (a, b), (b, c) and (c, d).

You can clean this up a bit using the itertools library:

from itertools import tee 

def pairwise(iterable): 
    # The itertools "pairwise" recipe: s -> (s0, s1), (s1, s2), (s2, s3), ... 
    # (on Python 3.10+ you can simply do: from itertools import pairwise) 
    a, b = tee(iterable) 
    next(b, None) 
    return zip(a, b) 

with open("C:\\Users\\Lenovo2\\Desktop\\Test123.txt", "r") as lines: 
    for a, b in pairwise(lines): 
        x = a.rstrip() 
        y = b.rstrip() 
        # ... rest unchanged 
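
Putting the pieces together (overlapping pairs, plus dropping blank lines before any comparison), one possible sketch that reuses the pairwise helper above:

with open("C:\\Users\\Lenovo2\\Desktop\\Test123.txt", "r") as sentence_file: 
    # Strip line endings and discard blank lines up front 
    sentences = [line.rstrip() for line in sentence_file if line.strip()] 

results = [] 
for x, y in pairwise(sentences): 
    results.append([x, y, similarity(x, y, True)]) 

for pair in results: 
    print(pair) 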