2017-06-20 17 views
5

私はpipe-attoparsecパーサーとストレージをさらに最適化しようとしていますが、 GHC 8.0.2(stack ghc -- -O2 -rtsopts -threaded -Wall account-parser.hsHaskell、pipe、attoparsec、およびcontainersでのメモリの最適化

でコンパイル

考えると、アカウント-parser.hs

{-# LANGUAGE RankNTypes #-} 
{-# LANGUAGE OverloadedStrings #-} 
{-# LANGUAGE NoImplicitPrelude #-} 

import Protolude hiding (for) 

import Data.Hashable 
import Data.IntMap.Strict (IntMap) 
import Data.Vector (Vector) 
import Pipes 
import Pipes.Parse 
import Pipes.Safe (MonadSafe, runSafeT) 
import qualified Data.Attoparsec.ByteString.Char8 as AB 
import qualified Data.IntMap.Strict as IM 
import qualified Data.Vector as Vector 
import qualified Pipes.Attoparsec as PA 
import qualified Pipes.ByteString as PB 
import qualified Pipes.Safe.Prelude as PSP 

-- accountid|account-name|contractid|code 

data AccountLine = AccountLine { 
    _accountId   :: !ByteString, 
    _accountName  :: !ByteString, 
    _accountContractId :: !ByteString, 
    _accountCode  :: !Word32 
    } deriving (Show) 

type MapCodetoAccountIdIdx = IntMap Int 

data Accounts = Accounts { 
    _accountIds :: !(Vector ByteString), 
    _cache  :: !(IntMap Int), 
    _accountCodes :: !MapCodetoAccountIdIdx 
    } deriving (Show) 


parseAccountLine :: AB.Parser AccountLine 
parseAccountLine = AccountLine <$> 
    getSubfield <* delim <*> 
    getSubfield <* delim <*> 
    getSubfield <* delim <*> 
    AB.decimal <* AB.endOfLine 
    where getSubfield = AB.takeTill (== '|') 
      delim = AB.char '|' 

-- 

aempty :: Accounts 
aempty = Accounts Vector.empty IM.empty IM.empty 

aappend :: Accounts -> AccountLine -> Accounts 
aappend (Accounts ids a2i cps) (AccountLine aid an cid cp) = 
    case IM.lookup (hash aid) a2i of 
     Nothing -> Accounts 
       (Vector.snoc ids (toS aid)) 
       (IM.insert (hash aid) (length ids) a2i) 
       (IM.insert (fromIntegral cp) (length ids) cps) 
     Just idx -> Accounts ids a2i (IM.insert (fromIntegral cp) idx cps) 

foldAccounts :: (Monad m) => Parser AccountLine m Accounts 
foldAccounts = foldAll aappend aempty identity 

readByteStringFile :: (MonadSafe m) => FilePath -> Producer' ByteString m() 
readByteStringFile file = PSP.withFile file ReadMode PB.fromHandle 

accountLines :: Text -> MonadSafe m => Producer AccountLine m (Either (PA.ParsingError, Producer ByteString m())()) 
accountLines filename = PA.parsed parseAccountLine (readByteStringFile (toS filename)) 


main :: IO() 
main = do 
    [filename] <- getArgs 
    x <- runSafeT $ runEffect $ Pipes.Parse.evalStateT foldAccounts (accountLines (toS filename)) 

    print $ sizes x 

sizes :: Accounts -> (Int, Int, Int) 
sizes (Accounts aid xxx acp) = (Vector.length aid, IM.size xxx, IM.size acp) 

は、私がどんな下位のメモリ使用量を取得することはできません。私は速いルックアップをしなければならないので、IntMaps。ファイルは約20 MBです(効率的ではありません)。ほとんどのデータは5 MBに収まる必要があります。

$ ./account-parser /tmp/accounts +RTS -s 
(5837,5837,373998) 
    1,631,040,680 bytes allocated in the heap 
    221,765,464 bytes copied during GC 
     41,709,048 bytes maximum residency (13 sample(s)) 
     2,512,560 bytes maximum slop 
       82 MB total memory in use (0 MB lost due to fragmentation) 

            Tot time (elapsed) Avg pause Max pause 
    Gen 0  2754 colls,  0 par 0.105s 0.142s  0.0001s 0.0002s 
    Gen 1  13 colls,  0 par 0.066s 0.074s  0.0057s 0.0216s 

    TASKS: 4 (1 bound, 3 peak workers (3 total), using -N1) 

    SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled) 

    INIT time 0.000s ( 0.001s elapsed) 
    MUT  time 0.324s ( 0.298s elapsed) 
    GC  time 0.171s ( 0.216s elapsed) 
    EXIT time 0.000s ( 0.005s elapsed) 
    Total time 0.495s ( 0.520s elapsed) 

    Alloc rate 5,026,660,297 bytes per MUT second 

    Productivity 65.5% of total user, 58.4% of total elapsed 

gc_alloc_block_sync: 0 
whitehole_spin: 0 
gen[0].sync: 0 
gen[1].sync: 0 

とプロファイル:

enter image description here

+0

を実行している私は、これでは専門家ではないので、塩の必要量を次のように取る:それは配列があなたのヒープの大半を占めているように見えます。入力ファイルにはいくつのユニークアカウントがありますか?新しいアカウントが見つかるたびに、 'Vector.snoc'は配列全体をコピーして、古いものをガベージにする必要があります。安価な 'add'(例えば、 '[]'、 'Seq'、または可変な可変長配列)のデータ構造にアカウントIDをロードしようとしましたか? –

+0

それをフォローアップすると、私はリストと 'fromListを使っていると思います。折り返した後に逆戻りすると助けになる。ベクトルは、効率的な短所やスヌークのために設計されていません。 –

+0

@BenjaminHodgson私は '[]'を試しましたが、メモリを少しずつ引き上げました( '-c'を使うかどうかによって10MBから20MBまで)。 –

答えて

0

I、

  • は、代わりに締固めにHashMap Text (Set Word32)
  • ターンを使用し、中間見上げるキャッシュ
  • を削除した場合+RTS -c

私は総メモリを34 MBにすることができますが、私のルックアップは今O(n)に行きます。これは私が得る最高の可能性が高いです。

{-# LANGUAGE RankNTypes #-} 
{-# LANGUAGE OverloadedStrings #-} 
{-# LANGUAGE NoImplicitPrelude #-} 

import   Protolude hiding (for) 

import qualified Data.Attoparsec.ByteString.Char8 as AB 
import   Data.HashMap.Strict (HashMap) 
import qualified Data.HashMap.Strict as HashMap 
import   Data.Set (Set) 
import qualified Data.Set as Set 
import   Pipes 
import qualified Pipes.Attoparsec as PA 
import qualified Pipes.ByteString as PB 
import   Pipes.Parse 
import   Pipes.Safe (MonadSafe, runSafeT) 
import qualified Pipes.Safe.Prelude as PSP 

-- accountid|account-name|contractid|code 

data AccountLine = AccountLine { 
    _accountId   :: !ByteString, 
    _accountName  :: !ByteString, 
    _accountContractId :: !ByteString, 
    _accountCode  :: !Word32 
    } deriving (Show) 


newtype Accounts = Accounts (HashMap Text (Set Word32)) 
       deriving (Show) 

parseAccountLine :: AB.Parser AccountLine 
parseAccountLine = AccountLine <$> 
    getSubfield <* delim <*> 
    getSubfield <* delim <*> 
    getSubfield <* delim <*> 
    AB.decimal <* AB.endOfLine 
    where getSubfield = AB.takeTill (== '|') 
      delim = AB.char '|' 

-- 

aempty :: Accounts 
aempty = Accounts HashMap.empty 

aappend :: Accounts -> AccountLine -> Accounts 
aappend (Accounts cps) (AccountLine aid an cid cp) = 
    case HashMap.lookup (toS aid) cps of 
     Nothing -> Accounts (HashMap.insert (toS aid) (Set.singleton cp) cps) 
     Just value -> Accounts (HashMap.update (\codes -> Just (Set.insert cp value)) (toS aid) cps) 

foldAccounts :: (Monad m) => Parser AccountLine m Accounts 
foldAccounts = foldAll aappend aempty identity 

readByteStringFile :: (MonadSafe m) => FilePath -> Producer' ByteString m() 
readByteStringFile file = PSP.withFile file ReadMode PB.fromHandle 

accountLines :: Text -> MonadSafe m => Producer AccountLine m (Either (PA.ParsingError, Producer ByteString m())()) 
accountLines filename = PA.parsed parseAccountLine (readByteStringFile (toS filename)) 


main :: IO() 
main = do 
    [filename] <- getArgs 
    x <- runSafeT $ runEffect $ Pipes.Parse.evalStateT foldAccounts (accountLines (toS filename)) 

    print $ sizes x 

    -- print x 
    print $ lookupAccountFromCode x 254741 
    print $ lookupAccountFromCode x 196939 


sizes :: Accounts -> Int 
sizes (Accounts acp) = HashMap.size acp 

lookupAccountFromCode :: Accounts -> Word32 -> Maybe Text 
lookupAccountFromCode (Accounts accts) cp = do 
    let f a k v = bool a (Just k) (Set.member cp v) 
    HashMap.foldlWithKey' f Nothing accts 

そして

$ ./account-parser /tmp/accounts +RTS -s -c 
5837 
Just "1-PCECJ5" 
Just "AANA-76KOUU" 
    1,652,177,904 bytes allocated in the heap 
     83,767,440 bytes copied during GC 
     17,563,800 bytes maximum residency (18 sample(s)) 
     751,144 bytes maximum slop 
       34 MB total memory in use (0 MB lost due to fragmentation) 

            Tot time (elapsed) Avg pause Max pause 
    Gen 0  3083 colls,  0 par 0.058s 0.069s  0.0000s 0.0002s 
    Gen 1  18 colls,  0 par 0.115s 0.151s  0.0084s 0.0317s 

    TASKS: 4 (1 bound, 3 peak workers (3 total), using -N1) 

    SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled) 

    INIT time 0.000s ( 0.002s elapsed) 
    MUT  time 0.263s ( 0.289s elapsed) 
    GC  time 0.173s ( 0.219s elapsed) 
    EXIT time 0.009s ( 0.008s elapsed) 
    Total time 0.445s ( 0.518s elapsed) 

    Alloc rate 6,286,682,587 bytes per MUT second 

    Productivity 61.0% of total user, 57.4% of total elapsed 

gc_alloc_block_sync: 0 
whitehole_spin: 0 
gen[0].sync: 0 
gen[1].sync: 0 
関連する問題