2017-12-31 277 views
1

私は時間、緯度、経度、標高、速度のデータフレームを持っています。緯度経度ペアを滑らかにするための公差に基づいてデータセットを減らすために整形式を使用しています。正常に動作しますが、平滑化された単純化バージョンのデータポイント(lat、lon)をTime、Elevation、要素の元のデータフレームと一致させると、データポイントが500を超えると時間がかかりすぎます。データフレーム内の行を一致させ、一致しない行を削除するより速い方法は何ですか?

私がやっていることは、元のデータセットをループして一致するペアを見つけ、それらがすべて一致するまでインデックスを記録することです。私はポイントがほぼ常に連続しており、最初から再検索する理由がないので、検索を少しスピードアップするために "last_find"変数を使用しています。 FWIWでは、データの連続した線の性質とスムージング方法に基づいて意味をなされる、テストデータセットの完全なデータフレームスキャンに戻る必要があることは決して見たことがありません。

 lon = pd.Series(pd.Series(simplified_line.coords.xy)[1]) 
     lat = pd.Series(pd.Series(simplified_line.coords.xy)[0]) 

     si = pd.DataFrame({'Longitude': lon, 'Latitude': lat}) 
     si.tail() 

     si['df_index'] = None 
     pd.options.mode.chained_assignment = None # default='warn', suppress warning during copying dataframe 
     last_find = 0 # assume data is sequential and and start search at last point found to reduce iterations 
     for si_i, si_row in si.iterrows(): 
      si_coords = (si_row['Latitude'], si_row['Longitude']) 
      found = False 
      for df_i, df_row in islice(track.iterrows(), last_find, None): 
       if si_coords == (df_row['Latitude'], df_row['Longitude']): 
        si['df_index'][si_i] = df_i 
        last_find = df_i 
        found = True 
        break 
      if not found: 
       last_find = 0 
       # Rescanning full dataset for match 
       for df_i, df_row in islice(track.iterrows(), last_find, None): 
        if si_coords == (df_row['Latitude'], df_row['Longitude']): 
         si['df_index'][si_i] = df_i 
         last_find = df_i 
         break 

     rs = track.loc[si['df_index'].dropna()] 

データフレームを "rs"に再構築するこのプロセスは非常に遅いです。 (わずか500ポイントで22秒)。このタイプのマッチングを元のデータフレームサイズを減らすために行うより良い方法はありますか?最も難しい部分は、あなたが求めていたものを理解して

import pandas as pd 
from pandas import DataFrame 
from shapely.geometry import LineString 
from time import time 
from itertools import islice 
import datetime 


class RDP: 

    def __init__(self, tracks, tolerance=0.000002): 

     self.df = tracks 
     self.tolerance = tolerance 
     return 

    def smooth(self): 
     """ 
     Smooths list of data frames 
     :return: list of smoothed data frames 
     """ 

     results = [] 
     start_time = time() 
     for track in self.df: 

      coordinates = track.as_matrix(columns=['Latitude', 'Longitude']) 
      line = LineString(coordinates) 
      # If preserve topology is set to False, the method will use the Ramer-Douglas-Peucker algorithm 
      simplified_line = line.simplify(self.tolerance, preserve_topology=False) 

      lon = pd.Series(pd.Series(simplified_line.coords.xy)[1]) 
      lat = pd.Series(pd.Series(simplified_line.coords.xy)[0]) 

      si = pd.DataFrame({'Longitude': lon, 'Latitude': lat}) 
      si.tail() 

      si['df_index'] = None 
      pd.options.mode.chained_assignment = None # default='warn', suppress warning during copying dataframe 
      last_find = 0 # assume data is sequential and and start search at last point found to reduce iterations 
      for si_i, si_row in si.iterrows(): 
       si_coords = (si_row['Latitude'], si_row['Longitude']) 
       found = False 
       for df_i, df_row in islice(track.iterrows(), last_find, None): 
        if si_coords == (df_row['Latitude'], df_row['Longitude']): 
         si['df_index'][si_i] = df_i 
         last_find = df_i 
         found = True 
         break 
       if not found: 
        last_find = 0 
        # Rescanning full dataset for match 
        for df_i, df_row in islice(track.iterrows(), last_find, None): 
         if si_coords == (df_row['Latitude'], df_row['Longitude']): 
          si['df_index'][si_i] = df_i 
          last_find = df_i 
          break 

      rs = track.loc[si['df_index'].dropna()] 
      results.append(rs) 
      print('process took %s seconds' % round(time() - start_time, 2)) 
     return results 


if __name__ == "__main__": 
    data = [[-155.05156, 19.73201, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 9), None, 0], 
      [-155.05156, 19.73201, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 10), 0.0, 0.0], 
      [-155.05156, 19.73201, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 11), 1.8244950963755258, 0.0], 
      [-155.05157, 19.73202, 22.0, datetime.datetime(2017, 12, 28, 17, 50, 12), 1.4678475295952227, 
      1.527543187532957], 
      [-155.05157, 19.73203, 22.0, datetime.datetime(2017, 12, 28, 17, 50, 13), 1.11120000035271, 
      1.1122983328025196], 
      [-155.05157, 19.73203, 22.0, datetime.datetime(2017, 12, 28, 17, 50, 14), 2.3687194876712123, 0.0], 
      [-155.05159, 19.73204, 22.0, datetime.datetime(2017, 12, 28, 17, 50, 15), 1.7399596859879076, 
      2.3710607190787623], 
      [-155.05159, 19.73205, 22.0, datetime.datetime(2017, 12, 28, 17, 50, 16), 1.7399596281612155, 
      1.112298332448747], 
      [-155.05161, 19.73206, 22.0, datetime.datetime(2017, 12, 28, 17, 50, 17), 1.7399595703344959, 
      2.3710604875433656], 
      [-155.05161, 19.73207, 22.0, datetime.datetime(2017, 12, 28, 17, 50, 18), 1.7399595111950648, 
      1.112298332448747], 
      [-155.05163, 19.73208, 22.0, datetime.datetime(2017, 12, 28, 17, 50, 19), 2.096606870194645, 
      2.3710602536550747], 
      [-155.05164, 19.73209, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 20), 1.6752646424182498, 
      1.527542875149723], 
      [-155.05165, 19.7321, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 21), 2.289051665826317, 
      1.5275428299682154], 
      [-155.05167, 19.73212, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 22), 2.754150510219822, 
      3.055085523596321], 
      [-155.05168, 19.73214, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 23), 2.4562322562443732, 
      2.458660072750598], 
      [-155.05169, 19.73216, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 24), 2.4124750922364004, 
      2.458660017743196], 
      [-155.05171, 19.73217, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 25), 2.9312779133573135, 
      2.3710592140947706], 
      [-155.05172, 19.7322, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 26), 2.9312777104723176, 
      3.497291307909982], 
      [-155.05174, 19.73221, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 27), 2.7103926533029608, 
      2.3710587533735854], 
      [-155.05176, 19.73223, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 28), 2.7541498631496495, 
      3.0550845355246805], 
      [-155.05177, 19.73225, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 29), 3.214644630731547, 
      2.4586597654103666], 
      [-155.0518, 19.73227, 22.0, datetime.datetime(2017, 12, 28, 17, 50, 30), 3.272104136727811, 
      3.8489512292988133], 
      [-155.05182, 19.73228, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 31), 2.4699338807004922, 
      2.3710579406395524], 
      [-155.05184, 19.73229, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 32), 2.710391831502253, 
      2.3710578250365275], 
      [-155.05186, 19.73231, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 33), 2.571984173054396, 
      3.055083816479077], 
      [-155.05188, 19.73231, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 34), 2.2303087951040954, 
      2.0939690200246193], 
      [-155.0519, 19.73232, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 35), 2.412473942713288, 
      2.371057475376574], 
      [-155.05191, 19.73234, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 36), 2.892513413572397, 
      2.458659515345401], 
      [-155.05194, 19.73235, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 37), 3.1904307936770424, 
      3.3320852845152014], 
      [-155.05196, 19.73237, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 38), 3.190430517344615, 
      3.0550832771287606], 
      [-155.05199, 19.73238, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 39), 3.1904303798788662, 
      3.332084723430405], 
      [-155.05201, 19.7324, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 40), 2.710390701811524, 
      3.055083009665372], 
      [-155.05203, 19.73241, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 41), 2.3687150928988454, 
      2.3710564358044426], 
      [-155.05205, 19.73242, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 42), 2.4699323240615967, 
      2.371056320034746], 
      [-155.05207, 19.73243, 22.0, datetime.datetime(2017, 12, 28, 17, 50, 43), 2.4699322129590935, 
      2.371056201746366], 
      [-155.05209, 19.73244, 22.0, datetime.datetime(2017, 12, 28, 17, 50, 44), 3.1708845562819272, 
      2.3710560884951897], 
      [-155.05212, 19.73246, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 45), 3.1708842854814536, 
      3.8489481826678027], 
      [-155.05214, 19.73247, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 46), 2.710389981744121, 
      2.371055741019484], 
      [-155.05216, 19.73249, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 47), 2.710389776420281, 
      3.0550821973706315], 
      [-155.05218, 19.7325, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 48), 2.710389674730764, 
      2.3710553940411665], 
      [-155.0522, 19.73252, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 49), 3.0520651178491267, 
      3.055081929390566], 
      [-155.05222, 19.73254, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 50), 2.7103892638045273, 
      3.0550817499490517], 
      [-155.05224, 19.73255, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 51), 2.710389160077225, 
      2.371054817376382], 
      [-155.05226, 19.73257, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 52), 2.7103889546600244, 
      3.0550814785741593], 
      [-155.05228, 19.73258, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 53), 3.1069293021008604, 
      2.3710544675462026], 
      [-155.05231, 19.7326, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 54), 2.6855886784059737, 
      3.8489459341931775], 
      [-155.05232, 19.73261, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 55), 2.2890482314811806, 
      1.5275405390797137], 
      [-155.05234, 19.73263, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 56), 3.0520641294586754, 
      3.055080939475094], 
      [-155.05236, 19.73265, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 57), 3.052063950954796, 
      3.0550807613433832], 
      [-155.05238, 19.73267, 23.0, datetime.datetime(2017, 12, 28, 17, 50, 58), 3.131887830893072, 
      3.0550805819008096], 
      [-155.0524, 19.73269, 22.0, datetime.datetime(2017, 12, 28, 17, 50, 59), 2.79021178709455, 
      3.0550804024580738], 
      [-155.05242, 19.7327, 22.0, datetime.datetime(2017, 12, 28, 17, 51), 2.710387619443569, 2.371053080813694], 
      [-155.05244, 19.73272, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 1), 2.7103874140249378, 
      3.0550801308237463], 
      [-155.05246, 19.73273, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 2), 2.790211385044462, 
      2.3710527309810545], 
      [-155.05248, 19.73275, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 3), 2.7902111840874317, 
      3.0550798630984697], 
      [-155.0525, 19.73276, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 4), 2.7103870033308333, 
      2.3710523836665467], 
      [-155.05252, 19.73278, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 5), 3.4486025267361144, 
      3.0550795919786187], 
      [-155.05255, 19.7328, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 6), 2.685586626408571, 
      3.848942726844205], 
      [-155.05256, 19.73281, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 7), 2.3688709766335045, 
      1.5275396387506783], 
      [-155.05258, 19.73283, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 8), 2.790210380257448, 
      3.055079144293523], 
      [-155.0526, 19.73284, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 9), 2.7902102788103615, 
      2.37105145984478], 
      [-155.05262, 19.73286, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 10), 3.211710268550555, 
      3.0550788729150917], 
      [-155.05264, 19.73288, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 11), 3.1318859927435865, 
      3.0550786952967113], 
      [-155.05266, 19.7329, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 12), 2.71038556626744, 
      3.0550785153370557], 
      [-155.05268, 19.73291, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 13), 2.3687093086562108, 
      2.3710506470866193], 
      [-155.0527, 19.73292, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 14), 2.469926768769237, 
      2.3710505291274866], 
      [-155.05272, 19.73293, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 15), 3.272095501921749, 
      2.3710504155416987], 
      [-155.05275, 19.73295, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 16), 3.170877650790823, 
      3.8489403198357444], 
      [-155.05277, 19.73296, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 17), 2.3687087304575547, 
      2.371050068389868], 
      [-155.05279, 19.73297, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 18), 2.3687086147991403, 
      2.3710499502645015], 
      [-155.05281, 19.73298, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 19), 2.790208873068198, 
      2.3710498368443993], 
      [-155.05283, 19.733, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 20), 2.7902086721088404, 
      3.0550776168243536], 
      [-155.05285, 19.73301, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 21), 2.3687081533850103, 
      2.3710494917124847], 
      [-155.05287, 19.73302, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 22), 2.368708036413647, 
      2.3710493735868456], 
      [-155.05289, 19.73303, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 23), 2.71038422971785, 
      2.3710492556271126], 
      [-155.05291, 19.73305, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 24), 2.7541450397296856, 
      3.055077167566475], 
      [-155.05292, 19.73307, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 25), 2.412468561327511, 
      2.4586574769885563], 
      [-155.05294, 19.73308, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 26), 2.368707343773043, 
      2.371048681300471], 
      [-155.05296, 19.73309, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 27), 2.7103836137458464, 
      2.3710485633404104], 
      [-155.05298, 19.73311, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 28), 2.7103834083234952, 
      3.0550766266297775], 
      [-155.053, 19.73312, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 29), 2.710383306631016, 
      2.3710482158540307], 
      [-155.05302, 19.73314, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 30), 3.190419933693602, 
      3.0550763588997274], 
      [-155.05305, 19.73315, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 31), 2.8487433306642, 
      3.332070455791797], 
      [-155.05307, 19.73316, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 32), 2.710382894565474, 
      2.371047750738638], 
      [-155.05309, 19.73318, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 33), 2.811600384514776, 
      3.055075999491225], 
      [-155.05311, 19.73319, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 34), 2.5711414689310477, 
      2.3710474054378037], 
      [-155.05313, 19.7332, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 35), 2.811600187179107, 
      2.371047289829799], 
      [-155.05315, 19.73322, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 36), 3.190418832907143, 
      3.055075638256173], 
      [-155.05318, 19.73323, 23.0, datetime.datetime(2017, 12, 28, 17, 51, 37), 2.9903853616785745, 
      3.3320689728999744], 
      [-155.05319, 19.73325, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 38), 2.5103487005418743, 
      2.4586569745708617], 
      [-155.05321, 19.73326, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 39), 2.710381868668547, 
      2.3710465975375103], 
      [-155.05323, 19.73328, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 40), 2.7103816619319017, 
      3.0550751009671164], 
      [-155.05325, 19.73329, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 41), 2.3687049136083917, 
      2.3710462453430305], 
      [-155.05327, 19.7333, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 42), 2.3687047992608443, 
      2.371046131921171], 
      [-155.05329, 19.73331, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 43), 2.710381355108257, 
      2.3710460163125644], 
      [-155.05331, 19.73333, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 44), 2.7103811483711353, 
      3.0550746517035354], 
      [-155.05333, 19.73334, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 45), 2.368704335399346, 
      2.371045666469919], 
      [-155.05335, 19.73335, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 46), 3.1069186773136934, 
      2.3710455533797066], 
      [-155.05338, 19.73337, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 47), 2.685580778436882, 
      3.8489335801226137], 
      [-155.05339, 19.73338, 22.0, datetime.datetime(2017, 12, 28, 17, 51, 48), 2.427402283410334, 
      1.5275370795403593]] 
    columns = ['Longitude', 'Latitude', 'Altitude', 'Time', 'Speed', 
       'Distance'] 
    df = list() 
    df.append(DataFrame(data, columns=columns)) 
    rdp = RDP(df) 
    print(rdp.smooth()) 
+1

[mcve] _really_ help ... –

+0

@cᴏʟᴅsᴘᴇᴇᴅokですが、ちょっとぎっしりしています – user6972

+0

あなたの実際のデータセットである必要はありません...おもちゃの作品も...しかし、それは小さくて最小限のものでなければなりません。このガイドラインをご覧ください:https://stackoverflow.com/a/20159305/4909087 –

答えて

1

はここで検査のための完全な例です。これは、最初のforループから始まるすべてのコードに相当します。

rs = si.merge(track, on = ["Latitude", "Longitude"]) 

基本的には、2つの列に基づいて2つのデータフレームをマージするだけです。このマージはデフォルトで内部マージに設定されています。内部マージは、マッチが両方で見つかった行のみを保持します。

+0

私は理解して、私は何を尋ねるのか分からなかった。これはデータフレームを使った初めてのことですが、私が理解できる唯一のことは、それを反復することだけでした。ありがとう、この方法は100倍高速です。 – user6972

+1

問題ありません。 https://pandas.pydata.org/pandas-docs/stable/merging.htmlを読むことを強くお勧めします。 –

関連する問題