I have a parser I'm trying to write and I've gone over multiple versions of it and I can't seem to keep the memory usage down. I'm trying to parse the wikipedia sql dumps and in this example take the page entries file and throw them all in one giant vector (13 million pages). I got this working before, but I sharded the file into smaller pieces and aggregated them from the filesystem. Anyway, below is my attempt to parse these files in one pass. I'm pulling out the lines "INSERT INTO ... (data),(data)..(data);"
. I'm letting the other lines just fail in the parser and not yield values. The insert lines average 1MB in size and 8k entries.
I've tried what feels like a thousand versions of this with deepseqs and outputting lists of page entries per line instead of the vector I have now and using lineC with a partial parser instead of the linesUnboundedC. I assume there must be some high level concept I'm missing that is keeping the memory usage exponential. The file is 5GB and I'm easily blowing through 16GB of memory, but I can't seem to nail down a lazy thunk memory leak anywhere. I would assume that this code would be able to store each line independently and then move on without the extra memory to the next line, but I can't figure out what is wrong. I had no issue counting the amount of entries or using a more complex monoid to get data from the file using the same parser.
Any help is much appreciated!
{-# LANGUAGE OverloadedStrings #-}
module Parser.Helper where
import Conduit
import Control.DeepSeq
import Control.Monad (void)
import Control.Monad.Primitive
import Data.Attoparsec.Text
import Data.Maybe (catMaybes)
import Data.Text (Text)
import qualified Data.Vector as V
import Data.Vector.Generic (Vector)
import System.IO (openFile,IOMode(..))
data Page = Page !Int !Text !Bool
deriving Show
instance NFData Page where
rnf (Page pid title isRedirect) = pid `seq` title `seq` isRedirect `seq` ()
parseFile
:: (Foldable t, Vector v e)
=> FilePath -> Parser (t e) -> IO [v e]
parseFile fp parser =
do
handle <- openFile fp ReadMode
runConduit $ sourceHandle handle
.| decodeUtf8LenientC
.| peekForeverE linesUnboundedC
.| vectors parser
.| sinkList
vectors
:: (Foldable t, Vector v e, MonadBase base m, Control.Monad.Primitive.PrimMonad base)
=> Parser (t e) -> ConduitM Text (v e) m ()
vectors parser =
do
vectorBuilderC (1024*1024)
(\f ->
peekForeverE $ do
ml <- await
case ml of
Nothing -> return ()
Just l ->
case parseOnly parser l of
Left _ -> return ()
Right v -> mapM_ f v
)
parsePageLine :: Parser (V.Vector Page)
parsePageLine =
do
string "INSERT INTO" *> skipWhile (/= '(')
V.fromList . catMaybes <$> sepBy' parsePageField (char ',') <* char ';'
parsePageField :: Parser (Maybe Page)
parsePageField =
do
void $ char '('
pid <- parseInt <* char ','
namespace <- parseInt <* char ','
title <- parseTextField <* char ','
_ <- skipField <* char ','
_ <- skipField <* char ','
redirect <- parseInt <* char ','
void $ sepBy' skipField (char ',')
void $ char ')'
let ret = case namespace == 0 of
True -> Just $ Page pid title (redirect == 1)
False -> Nothing
return $ force ret
parseTextField :: Parser Text
parseTextField = char '\'' *> scan False f <* char '\''
where
f :: Bool -> Char -> Maybe Bool
f False '\'' = Nothing
f False '\\' = Just True
f _ _ = Just False
skipField :: Parser ()
skipField = void $ scan False f
where
f :: Bool-> Char -> Maybe Bool
f False '\\' = Just True
f False ',' = Nothing
f False ')' = Nothing
f _ _ = Just False
parseInt :: Parser Int
parseInt = signed $ decimal