2016-04-11 7 views
1

こんにちは、私はこのエラーを取得しておくNLTK使用してJSONからストップワードの削除:トークン化と

D:\WinPython-32bit-2.7.10.3\python-2.7.10>python TweetTest.py Twitter.json 
Traceback (most recent call last): 
    File "TweetTest.py", line 60, in <module> 
    tweet = json.loads(line) 
    File "D:\WinPython-32bit-2.7.10.3\python-2.7.10\lib\json\__init__.py", line 338, in loads 
    return _default_decoder.decode(s) 
    File "D:\WinPython-32bit-2.7.10.3\python-2.7.10\lib\json\decoder.py", line 369, in decode 
    raise ValueError(errmsg("Extra data", s, end, len(s))) 
ValueError: Extra data: line 1 column 4488 - line 1 column 99678411 (char 4487 - 99678410) 

私が間違って何であるか見当がつかないを。私のコードは次の通りです:

import sys 
import json 
from collections import Counter 
import re 
from nltk.corpus import stopwords 
import string 

punctuation = list(string.punctuation) 
stop = stopwords.words('english') + punctuation + ['rt', 'via'] 

emoticons_str = r""" 
    (?: 
     [:=;] # Eyes 
     [oO\-]? # Nose (optional) 
     [D\)\]\(\]/\\OpP] # Mouth 
    )""" 

regex_str = [ 
    emoticons_str, 
    r'<[^>]+>', # HTML tags 
    r'(?:@[\w_]+)', # @-mentions 
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags 
    r'http[s]?://(?:[a-z]|[0-9]|[[email protected]&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs 

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers 
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and ' 
    r'(?:[\w_]+)', # other words 
    r'(?:\S)' # anything else 
] 

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE) 
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE) 

def tokenize(s): 
    return tokens_re.findall(s) 

def preprocess(s, lowercase=False): 
    tokens = tokenize(s) 
    if lowercase: 
     tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens] 
    return tokens 


if __name__ == '__main__': 
    fname = sys.argv[1] 

    with open(fname, 'r') as f: 
     count_all = Counter() 
     for line in f: 
      tweet = json.loads(line) 
      tokens = preprocess(tweet['text']) 
      count_all.update(tokens) 
     print(count_all.most_common(5)) 

これは私のJSONファイルの最初の2つの出力です。私はTweetストリームリスナーを使ってツイートを集めました。

{"created_at":"Wed Apr 06 08:33:55 +0000 2016","id":717631408345333760,"id_str":"717631408345333760","text":"RT @whosharold: Hilary Clinton cannot be president pls she can't even hold her man down what makes ya think she gon hold the office down","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":472387071,"id_str":"472387071","name":"BigGucciK 2x","screen_name":"KaisonThatBoy","location":"Bridgeport, CT","url":null,"description":null,"protected":false,"verified":false,"followers_count":1608,"friends_count":1219,"listed_count":8,"favourites_count":1293,"statuses_count":64337,"created_at":"Mon Jan 23 22:07:27 +0000 2012","utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/709500377104818182\/4vMu066C_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/709500377104818182\/4vMu066C_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/472387071\/1457000395","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Apr 06 03:16:15 +0000 2016","id":717551464575401984,"id_str":"717551464575401984","text":"Hilary Clinton cannot be president pls she can't even hold her man down what makes ya think she gon hold the office down","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":792436550,"id_str":"792436550","name":"sadboyz","screen_name":"whosharold","location":null,"url":null,"description":"platano maduro no vuelve a verde","protected":false,"verified":false,"followers_count":1285,"friends_count":979,"listed_count":11,"favourites_count":4877,"statuses_count":91425,"created_at":"Thu Aug 30 21:26:30 +0000 2012","utc_offset":-10800,"time_zone":"Atlantic Time (Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/773304539\/94dbc3d1558da7f1e3d2c6fffcb5d710.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/773304539\/94dbc3d1558da7f1e3d2c6fffcb5d710.jpeg","profile_background_tile":true,"profile_link_color":"0084B4","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/714669878012219392\/9HmilvPG_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/714669878012219392\/9HmilvPG_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/792436550\/1458855437","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":2,"favorite_count":7,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"whosharold","name":"sadboyz","id":792436550,"id_str":"792436550","indices":[3,14]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1459931635353"} 
{"created_at":"Wed Apr 06 08:33:55 +0000 2016","id":717631409742020609,"id_str":"717631409742020609","text":"RT @WisegalGranny: HONY Just Destroyed Donald Trump\u2019s Dream Of Becoming President - https:\/\/t.co\/8GIDVa76bZ Oooo, that's gonna hurt! #Unite\u2026","source":"\u003ca href=\"https:\/\/roundteam.co\" rel=\"nofollow\"\u003eRoundTeam\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2846552432,"id_str":"2846552432","name":"Glenn Silva","screen_name":"GlennSilva76","location":"hawaii","url":null,"description":"Christian, Constitutional Conservative, Pro 1A 2A and RF, It's Time To Unite And Take Our Country Back! #NeverTrump\r\n#UniteWithCruz #CruzCrew #CruzToVictory","protected":false,"verified":false,"followers_count":1981,"friends_count":2408,"listed_count":99,"favourites_count":1819,"statuses_count":38301,"created_at":"Wed Oct 08 07:34:50 +0000 2014","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"0084B4","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/691834454868889601\/1gkIbY1C_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/691834454868889601\/1gkIbY1C_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/2846552432\/1453447926","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Apr 06 08:18:04 +0000 2016","id":717627418454966272,"id_str":"717627418454966272","text":"HONY Just Destroyed Donald Trump\u2019s Dream Of Becoming President - https:\/\/t.co\/8GIDVa76bZ Oooo, that's gonna hurt! #UniteWithCruz #NeverTrump","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":4726275950,"id_str":"4726275950","name":"Wisegal1958","screen_name":"WisegalGranny","location":null,"url":null,"description":null,"protected":false,"verified":false,"followers_count":475,"friends_count":290,"listed_count":73,"favourites_count":8976,"statuses_count":10881,"created_at":"Fri Jan 08 02:36:28 +0000 2016","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"2B7BB9","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/715082668770242561\/ohjXvK85_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/715082668770242561\/ohjXvK85_normal.jpg","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":1,"favorite_count":0,"entities":{"hashtags":[{"text":"UniteWithCruz","indices":[114,128]},{"text":"NeverTrump","indices":[129,140]}],"urls":[{"url":"https:\/\/t.co\/8GIDVa76bZ","expanded_url":"http:\/\/www.parhlo.com\/hony-just-destroyed-trumps-dream-of-becoming-president\/?track=twb","display_url":"parhlo.com\/hony-just-dest\u2026","indices":[65,88]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"UniteWithCruz","indices":[133,140]},{"text":"NeverTrump","indices":[139,140]}],"urls":[{"url":"https:\/\/t.co\/8GIDVa76bZ","expanded_url":"http:\/\/www.parhlo.com\/hony-just-destroyed-trumps-dream-of-becoming-president\/?track=twb","display_url":"parhlo.com\/hony-just-dest\u2026","indices":[84,107]}],"user_mentions":[{"screen_name":"WisegalGranny","name":"Wisegal1958","id":4726275950,"id_str":"4726275950","indices":[3,17]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1459931635686"} 

私を助けてください。ありがとうございました。

答えて

0

私は同じエラーが1回ありました。 スクリプトは各行の読み込み時にJSONオブジェクトを読み込みますが、JSONオブジェクトが改行で区切られていない可能性があります。例えば

あなたのファイルは、ファイルが

json_oject1 json_oject2 

が含まれている場合はエラーになります一方、2つのオブジェクトが読み込まれます

json_oject1 
json_oject2 

が含まれている場合。

解決方法:新しいJSONオブジェクトを出力ファイルに書き込むときに改行を追加してください。

(関連:https://stackoverflow.com/a/21058946/2314737

+0

申し訳ありませんが、私は本当にあなたが何を意味するのか理解していません。どうしたらいいですか?ありがとう – windboy

+0

JSONファイルの最初の2行を表示できますか?あなたの質問には1つだけあります。 – user2314737

+0

こんにちはjsonの部分を編集しました。 – windboy

関連する問題