2011-07-13 16 views
4

1つの文字列(英語の文章)を読み込み、その中の「一般的でない」単語をカンマ区切りで並べた別の文字列を出力するJavaScriptコードを書こうとしています。

文字列内の一般的な単語をフィルタリングするJavaScriptコード

// Example input: the English sentence to scan.
var sentence="The dog ran to the other side of the field."; 

    // Comma-separated list of common words and phrases that should be filtered out of the sentence.
    var common_words="the, it is, we all, a, an, by, to, you, me, he, she, they, we, how, it, i, are, to, for, of"; 

--(ここに何らかのJavaScriptコードが入る)-- これをどのように実現すればよいでしょうか?

// Desired result for the example sentence: the remaining words, lower-cased and comma-separated.
var uncommon_words="dog, ran, other, side, field"; 

上のような結果を出力する、何らかのコードが欲しいのです。

回答

8

どうぞ:

/**
 * Returns the words of `sentence` that are not listed in `common`.
 *
 * Words are runs of `\w` characters. Comparison is case-insensitive and the
 * returned words are lower-cased, in order of appearance (duplicates kept).
 * Multi-word entries in `common` (e.g. "it is") can never match, because the
 * sentence is checked one word at a time.
 *
 * @param {string} sentence - Text to scan.
 * @param {string} common - Comma-separated list of words to exclude.
 * @returns {string[]} The lower-cased uncommon words.
 */
function getUncommon(sentence, common) {
    // match() returns null when the sentence has no words; use an empty list instead.
    var wordArr = sentence.match(/\w+/g) || [],
        // Prototype-less map, so words like "constructor" are not mistaken for common ones.
        commonObj = Object.create(null),
        commonArr = common.split(','),
        uncommonArr = [],
        word, i;

    for (i = 0; i < commonArr.length; i++) {
        // Lower-case the entries so they compare equal to the lower-cased words below.
        commonObj[commonArr[i].trim().toLowerCase()] = true;
    }

    for (i = 0; i < wordArr.length; i++) {
        word = wordArr[i].toLowerCase();
        if (!commonObj[word]) {
            uncommonArr.push(word);
        }
    }

    return uncommonArr;
}

ライブデモ:

// Tokenize the sentence into words; match() returns null when there are none.
var sentence_arr = sentence.match(/\w+/g) || [];
// Lower-case the common words once and tolerate any spacing around the commas.
var common_arr = common_words.toLowerCase().split(/\s*,\s*/);

var uncommon_arr = [];
for (var i = 0; i < sentence_arr.length; i++) {
    var candidate = sentence_arr[i].toLowerCase();
    // Keep the word only if it matches NONE of the common words
    // (pushing on every single mismatch would add each word many times).
    if (common_arr.indexOf(candidate) === -1) {
        uncommon_arr.push(candidate);
    }
}

// Comma-separated list of the uncommon words, in order of appearance.
var uncommon_words = uncommon_arr.join(', ');

完全にはテストしていませんが、要点は文と一般的な単語のリストの両方を分割することです。ライブデモはこちら:http://jsfiddle.net/simevidas/knXkS/

0

一般的な単語の連想配列を作成し、次に文をトークン化して、その連想配列に含まれていない単語を出力します。例:

// Lookup of the common words. A prototype-less object is used so that words
// such as "constructor" are not mistaken for excluded entries.
var excluded = Object.create(null);
// NOTE: this turns the common_words string into an array of its entries.
common_words = common_words.split(",");
// Index loop instead of for...in, which also visits inherited enumerable keys.
for (var i = 0; i < common_words.length; i++) {
    excluded[common_words[i].trim().toLowerCase()] = true;
}
var result = [];
// match() yields null when the sentence contains no words.
var match = sentence.match(/\w+/g) || [];
for (i = 0; i < match.length; i++) {
    // Compare case-insensitively, but keep the word's original casing in the output.
    if (!excluded[match[i].toLowerCase()]) {
        result.push(match[i]);
    }
}
var uncommon_words = result.join(", ");
+0

[…]でなければなりません。また、excluded は真偽値を持つ配列である必要はなく、キーと値のペアを持たせるのであれば Object にすべきです。さらに、ループを回すたびに split を呼び出していることになりませんか? – tomfumb

+0

スペースが単語の一部になっているようには見えません。オブジェクトと配列の件は修正しました。split はループの中で毎回呼び出されているのではなく、ループの対象を定義しているだけです。 – Max

+0

他に誰かがアイデアを持っていますか? –

2

これはいかがですか?

// Strips every common word/phrase (whole-word, case-insensitive) from the sentence.
// replace() returns the stripped string; the leftover text still has to be split into words.
sentence.replace(/\b(?:the|it is|we all|an?|by|to|you|[mh]e|she|they|we|how|it|i|are|for|of)\b/ig, ''); 

これで、文からすべての一般的な単語が削除されるはずです。残った文字列は、必要な方法で分割してください。

0

これが出発点になると思います。各単語を、そのリストの各要素と個別に照合します。素朴な方法で規模が大きくなると通用しませんが、このような小さな例では問題ありません。

0

String#diff関数は、相違点のリスト(珍しい用語)を返します。これらの用語は、配列または文字列として提供できます。

sentence.diff(terms) のように呼び出します。以下はユニットテストです:

// Unit test for String#diff: logs 'pass' when the example sentence, minus the
// listed terms, yields dog, ran, other, side, field; logs 'fail' otherwise.
var sentence = 'The dog ran to the other side of the field.';
var terms = 'the, it is, we all, a, an, by, to, you, me, he, she, they, we, how, it, i, are, to, for, of';
// NOTE: "terms" may also be supplied as an array.

if (sentence.diff(terms).toString() === 'dog,ran,other,side,field') {
    console.log('pass');
} else {
    console.log('fail');
}

以下は 'String.diff' 関数の定義です:

/**
 * Returns the words of this string that do not appear in `terms`.
 *
 * Words are runs of `\w` characters. Matching is case-insensitive and ignores
 * whitespace around each term; returned words keep their original casing and
 * order. Multi-word terms (e.g. "it is") never match, because the string is
 * checked one word at a time.
 *
 * @param {string|string[]} terms - Terms to exclude, as a comma-separated
 *   string or as an array. Non-string array entries are ignored.
 * @returns {string[]} The uncommon words; an empty array when `terms` is
 *   falsy or is neither a string nor an array.
 */
String.prototype.diff = function(terms){
    if (!terms) {
        return [];
    }

    var termList = (typeof terms === 'string') ? terms.split(',') : terms;

    if (!Array.isArray(termList)) {
        return [];
    }

    // Prototype-less lookup: constant-time membership checks without inherited keys.
    var lookup = Object.create(null);
    termList.forEach(function(term){
        // Skip non-string entries instead of throwing on toLowerCase().
        if (typeof term === 'string') {
            lookup[term.trim().toLowerCase()] = true;
        }
    });

    return String(this).split(/\W+/).filter(function(word){
        // split() yields empty strings at the edges (e.g. after a trailing period).
        return word.length > 0 && !lookup[word.toLowerCase()];
    });
};
5

削除したい単語はストップワードと呼ばれており、次のようなものです:

// English stop-word list: plain words first, followed by contractions.
["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your", "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't", "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd", "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll", "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't", "you'd", "you'll", "you're", "you've"] 

ここにソースがあります。 http://99webtools.com/list-english-stop-words.php

したがって、コードは次のようになります。なお、文を単語単位で分割しているため、「it is」や「we all」のような複数語の項目は検出されません。

/**
 * Returns the words of `sentence` that are not in the list supplied by
 * getStopWords().
 *
 * Comparison is case-insensitive and the returned words are lower-cased, in
 * order of appearance (duplicates kept).
 *
 * @param {string} sentence - Text to scan.
 * @returns {string[]} The lower-cased non-stop words.
 */
function getNoneStopWords(sentence) {
    var common = getStopWords();
    // Keep internal apostrophes so a contraction such as "don't" stays one token
    // and can match a contraction entry in the stop-word list; a plain \w+ would
    // split it into "don" and "t". match() returns null when nothing matches.
    var wordArr = sentence.match(/\w+(?:'\w+)*/g) || [],
        // Prototype-less map, so words like "constructor" are not mistaken for stop words.
        commonObj = Object.create(null),
        uncommonArr = [],
        word, i;

    for (i = 0; i < common.length; i++) {
        commonObj[common[i].trim().toLowerCase()] = true;
    }

    for (i = 0; i < wordArr.length; i++) {
        word = wordArr[i].toLowerCase();
        if (!commonObj[word]) {
            uncommonArr.push(word);
        }
    }
    return uncommonArr;
}

    /**
     * Supplies the English stop-word list: plain words first, then contractions.
     * A new array is built on every call.
     *
     * @returns {string[]} The stop words, all lower-case.
     */
    function getStopWords() {
        var plainWords = [
            "a", "able", "about", "across", "after", "all", "almost", "also", "am", "among",
            "an", "and", "any", "are", "as", "at", "be", "because", "been", "but",
            "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else",
            "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he",
            "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into",
            "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me",
            "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off",
            "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say",
            "says", "she", "should", "since", "so", "some", "than", "that", "the", "their",
            "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas",
            "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while",
            "who", "whom", "why", "will", "with", "would", "yet", "you", "your"
        ];
        var contractions = [
            "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd",
            "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't",
            "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've",
            "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd",
            "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll",
            "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't",
            "you'd", "you'll", "you're", "you've"
        ];
        return plainWords.concat(contractions);
    }
関連する問題