module Main where
import Data.Char
import Data.List
import System.Environment
import System.FilePath
import Text.Parsec
import Text.Parsec.String
paraBeginSpace :: Int
paraBeginSpace = 2
main :: IO ()
main = do
args <- getArgs
let input = head args
output = combine (takeDirectory input) "formatted.txt"
str <- readFile input
case (runParser file () "" str) of
Left err -> do
putStr "parse error at "
print err
Right x -> writeFile output x
return ()
emptyLine :: Parser String
emptyLine = do
many (satisfy isSepSpace)
newline
return ""
pageLine :: Parser String
pageLine = do
many1 (satisfy isSepSpace)
many1 digit
many (satisfy isSepSpace)
newline
return ""
isSepSpace :: Char -> Bool
isSepSpace s = isSpace s && s /= '\n'
insertSpaceBetweenEnglishWords :: [String] -> [String]
insertSpaceBetweenEnglishWords [] = []
insertSpaceBetweenEnglishWords (x:[]) = [x]
insertSpaceBetweenEnglishWords (x:y:xs)
| isEnWord x,
isEnWord y = x : " " : insertSpaceBetweenEnglishWords (y:xs)
| otherwise = x : insertSpaceBetweenEnglishWords (y:xs)
where
isEnWord :: String -> Bool
isEnWord w = and $ map (\l -> isAscii l && isLetter l) w
run :: Show a => Parser a -> String -> IO ()
run p input
= case (parse p "" input) of
Left err -> do{ putStr "parse error at "
; print err
}
Right x -> print x
word :: Parser String
word = do
content <- many1 (satisfy (not.isSpace ) )
return content
beginLine :: Parser String
beginLine = do
begin <- try (count paraBeginSpace space) <?> "段首2个空格"
beginWord <- word <?> "段首2个空格"
many $ satisfy isSepSpace
lineContents <- word `sepBy` satisfy isSepSpace
newline
return $ concat $ insertSpaceBetweenEnglishWords
$ begin : beginWord : lineContents
contentLine :: Parser String
contentLine = do
lineContents <- word `sepEndBy1` many (satisfy isSepSpace)
newline
return $ concat $ insertSpaceBetweenEnglishWords lineContents
brokenLine :: Parser String
brokenLine = do
notParaBegin -- 1 or 3+ spaces
lineContents <- word `sepEndBy1`
many (satisfy isSepSpace)
<?> "非段落起首空白数"
newline
many (choice [try emptyLine, try pageLine])
lookAhead contentLine
return $ concat
$ insertSpaceBetweenEnglishWords lineContents
notParaBegin :: Parser String
notParaBegin = try (do
space
space
many1 space
return ""
) -- >= 3 spaces
<|> (space >> return "") -- 1 space
paragraph :: Parser String
paragraph = do
beg <- beginLine
cons <- many (choice [try emptyLine
,try pageLine
,try contentLine
,try brokenLine
,try optString])
lookAhead beginLine
<|> lookAhead titleLine
<|> lookAhead (eof >> return "")
return $ concat $ beg : cons
where
optString :: Parser String
optString = do
sp <- try (do
space
space
many1 space)
<|> (space >> return " ")
lc <- anyChar `manyTill` (try newline)
many (choice [try emptyLine, try pageLine])
lookAhead notParaBegin
<|> (eof >> return "")
return $ "\n" ++ sp ++ lc
titleLine :: Parser String
titleLine = do
notParaBegin -- 1 or 3+ spaces
lineContents <- word `sepEndBy1`
many (satisfy isSepSpace)
<?> "非段落起首空白数"
newline
many (choice [try emptyLine, try pageLine])
lookAhead beginLine
return $ concat
$ insertSpaceBetweenEnglishWords lineContents
article :: Parser String
article = do
t <- titleLine
ps <- many1 paragraph
many (choice [try emptyLine, try pageLine])
lookAhead titleLine
<|> (eof >> return "")
return $ intercalate "\n"( (t++"\n"):ps)
file :: Parser String
file = do
ts <- many1 contentLine
many (choice [try emptyLine, try pageLine])
as <- many1 article
lookAhead eof
return $ " " ++ concat ts ++ "\n\n"
++ intercalate "\n\n\n" as
fileTitle :: Parser String
fileTitle = do
ts <- anyChar `manyTill` (try titleLine)
return ts
-- simple test func
testParser :: Parser String -> String -> IO ()
testParser p input =
case (runParser p () "" input) of
Left err -> do
putStr "parse error at "
print err
Right x -> putStrLn x
这段代码处理排版比较乱的文本文件,但是我一直整理文件不成功,忘记从哪个大牛的网站上摘来的,抱歉!
最新推荐文章于 2024-09-25 00:10:17 发布