{-# LANGUAGE BangPatterns, EmptyDataDecls, ScopedTypeVariables #-}

-- |
-- Module      : Data.Text.ICU.Regex.Pure
-- Copyright   : (c) 2010 Bryan O'Sullivan
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- Regular expression support for Unicode, implemented as bindings to
-- the International Components for Unicode (ICU) libraries.
--
-- The functions in this module are pure and hence thread safe, but
-- may not be as fast or as flexible as those in the
-- "Data.Text.ICU.Regex" module.
--
-- The syntax and behaviour of ICU regular expressions are Perl-like.
-- For complete details, see the ICU User Guide entry at
-- <http://userguide.icu-project.org/strings/regexp>.

module Data.Text.ICU.Regex.Pure
    (
    -- * Types
      MatchOption(..)
    , ParseError(errError, errLine, errOffset)
    , Match
    , Regex
    , Regular
    -- * Functions
    -- ** Construction
    , regex
    , regex'
    -- ** Inspection
    , pattern
    -- ** Searching
    , find
    , findAll
    -- ** Match groups
    -- $group
    , groupCount
    , unfold
    , span
    , group
    , prefix
    , suffix
    ) where

import qualified Control.Exception as E
import Data.String (IsString(..))
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Foreign as T
import Data.Text.ICU.Internal (TextI, fromUCharPtr, lengthWord, withUTextPtrText, utextPtrLength)
import Data.Text.ICU.Error.Internal (ParseError(..), handleError)
import qualified Data.Text.ICU.Regex as IO
import Data.Text.ICU.Regex.Internal hiding (Regex(..), regex)
import qualified Data.Text.ICU.Regex.Internal as Internal
import Foreign.ForeignPtr (ForeignPtr, withForeignPtr)
import Foreign.Marshal.Alloc (alloca)
import Foreign.Marshal.Array (advancePtr)
import Foreign.Storable (peek)
import Prelude hiding (span)
import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO)

-- | A compiled regular expression.
--
-- 'Regex' values are usually constructed using the 'regex' or
-- 'regex'' functions.  This type is also an instance of 'IsString',
-- so if you have the @OverloadedStrings@ language extension enabled,
-- you can construct a 'Regex' by simply writing the pattern in
-- quotes (though this does not allow you to specify any 'Option's).
newtype Regex = Regex {
      Regex -> Regex
reRe :: Internal.Regex
    }

instance Show Regex where
    show :: Regex -> String
show Regex
re = String
"Regex " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Text -> String
forall a. Show a => a -> String
show (Regex -> Text
forall r. Regular r => r -> Text
pattern Regex
re)

instance IsString Regex where
    fromString :: String -> Regex
fromString = [MatchOption] -> Text -> Regex
regex [] (Text -> Regex) -> (String -> Text) -> String -> Regex
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Text
T.pack

-- | A match for a regular expression.
data Match = Match {
      Match -> Regex
matchRe :: Internal.Regex
    , Match -> TextI
_matchPrev :: TextI
    }

instance Show Match where
    show :: Match -> String
show Match
m = String
"Match " String -> ShowS
forall a. [a] -> [a] -> [a]
++ [Text] -> String
forall a. Show a => a -> String
show ((Int -> Match -> Maybe Text) -> Match -> [Text]
unfold Int -> Match -> Maybe Text
group Match
m)

-- | A typeclass for functions common to both 'Match' and 'Regex'
-- types.
class Regular r where
    regRe :: r -> Internal.Regex

    regFp :: r -> ForeignPtr URegularExpression
    regFp = Regex -> ForeignPtr URegularExpression
Internal.reRe (Regex -> ForeignPtr URegularExpression)
-> (r -> Regex) -> r -> ForeignPtr URegularExpression
forall b c a. (b -> c) -> (a -> b) -> a -> c
. r -> Regex
forall r. Regular r => r -> Regex
regRe
    {-# INLINE regFp #-}

instance Regular Match where
    regRe :: Match -> Regex
regRe = Match -> Regex
matchRe

instance Regular Regex where
    regRe :: Regex -> Regex
regRe = Regex -> Regex
reRe

-- | Compile a regular expression with the given options.  This
-- function throws a 'ParseError' if the pattern is invalid, so it is
-- best for use when the pattern is statically known.
regex :: [MatchOption] -> Text -> Regex
regex :: [MatchOption] -> Text -> Regex
regex [MatchOption]
opts Text
pat = Regex -> Regex
Regex (Regex -> Regex) -> (IO Regex -> Regex) -> IO Regex -> Regex
forall b c a. (b -> c) -> (a -> b) -> a -> c
. IO Regex -> Regex
forall a. IO a -> a
unsafePerformIO (IO Regex -> Regex) -> IO Regex -> Regex
forall a b. (a -> b) -> a -> b
$ [MatchOption] -> Text -> IO Regex
IO.regex [MatchOption]
opts Text
pat

-- | Compile a regular expression with the given options.  This is
-- safest to use when the pattern is constructed at run time.
regex' :: [MatchOption] -> Text -> Either ParseError Regex
regex' :: [MatchOption] -> Text -> Either ParseError Regex
regex' [MatchOption]
opts Text
pat = IO (Either ParseError Regex) -> Either ParseError Regex
forall a. IO a -> a
unsafePerformIO (IO (Either ParseError Regex) -> Either ParseError Regex)
-> IO (Either ParseError Regex) -> Either ParseError Regex
forall a b. (a -> b) -> a -> b
$
  ((Regex -> Either ParseError Regex
forall a b. b -> Either a b
Right (Regex -> Either ParseError Regex)
-> (Regex -> Regex) -> Regex -> Either ParseError Regex
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Regex -> Regex
Regex) (Regex -> Either ParseError Regex)
-> IO Regex -> IO (Either ParseError Regex)
forall a b. (a -> b) -> IO a -> IO b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` [MatchOption] -> Text -> IO Regex
Internal.regex [MatchOption]
opts Text
pat) IO (Either ParseError Regex)
-> (ParseError -> IO (Either ParseError Regex))
-> IO (Either ParseError Regex)
forall e a. Exception e => IO a -> (e -> IO a) -> IO a
`E.catch`
  \(ParseError
err::ParseError) -> Either ParseError Regex -> IO (Either ParseError Regex)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return (ParseError -> Either ParseError Regex
forall a b. a -> Either a b
Left ParseError
err)

-- | Return the source form of the pattern used to construct this
-- regular expression or match.
pattern :: Regular r => r -> Text
pattern :: forall r. Regular r => r -> Text
pattern r
r = IO Text -> Text
forall a. IO a -> a
unsafePerformIO (IO Text -> Text)
-> ((Ptr URegularExpression -> IO Text) -> IO Text)
-> (Ptr URegularExpression -> IO Text)
-> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ForeignPtr URegularExpression
-> (Ptr URegularExpression -> IO Text) -> IO Text
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr (r -> ForeignPtr URegularExpression
forall r. Regular r => r -> ForeignPtr URegularExpression
regFp r
r) ((Ptr URegularExpression -> IO Text) -> Text)
-> (Ptr URegularExpression -> IO Text) -> Text
forall a b. (a -> b) -> a -> b
$ \Ptr URegularExpression
rePtr ->
  (Ptr Int32 -> IO Text) -> IO Text
forall a b. Storable a => (Ptr a -> IO b) -> IO b
alloca ((Ptr Int32 -> IO Text) -> IO Text)
-> (Ptr Int32 -> IO Text) -> IO Text
forall a b. (a -> b) -> a -> b
$ \Ptr Int32
lenPtr -> do
    textPtr <- (Ptr UErrorCode -> IO (Ptr UChar)) -> IO (Ptr UChar)
forall a. (Ptr UErrorCode -> IO a) -> IO a
handleError ((Ptr UErrorCode -> IO (Ptr UChar)) -> IO (Ptr UChar))
-> (Ptr UErrorCode -> IO (Ptr UChar)) -> IO (Ptr UChar)
forall a b. (a -> b) -> a -> b
$ Ptr URegularExpression
-> Ptr Int32 -> Ptr UErrorCode -> IO (Ptr UChar)
uregex_pattern Ptr URegularExpression
rePtr Ptr Int32
lenPtr
    (fromUCharPtr textPtr . fromIntegral) =<< peek lenPtr

-- | Find the first match for the regular expression in the given text.
find :: Regex -> Text -> Maybe Match
find :: Regex -> Text -> Maybe Match
find Regex
re0 Text
haystack = IO (Maybe Match) -> Maybe Match
forall a. IO a -> a
unsafePerformIO (IO (Maybe Match) -> Maybe Match)
-> ((Regex -> IO (Maybe Match)) -> IO (Maybe Match))
-> (Regex -> IO (Maybe Match))
-> Maybe Match
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
  Regex -> Text -> (Regex -> IO (Maybe Match)) -> IO (Maybe Match)
forall a. Regex -> Text -> (Regex -> IO a) -> IO a
matching Regex
re0 Text
haystack ((Regex -> IO (Maybe Match)) -> Maybe Match)
-> (Regex -> IO (Maybe Match)) -> Maybe Match
forall a b. (a -> b) -> a -> b
$ \Regex
re -> do
    m <- Regex -> IO Bool
IO.findNext Regex
re
    return $! if m then Just (Match re 0) else Nothing

-- | Lazily find all matches for the regular expression in the given
-- text.
findAll :: Regex -> Text -> [Match]
findAll :: Regex -> Text -> [Match]
findAll Regex
re0 Text
haystack = IO [Match] -> [Match]
forall a. IO a -> a
unsafePerformIO (IO [Match] -> [Match])
-> (IO [Match] -> IO [Match]) -> IO [Match] -> [Match]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. IO [Match] -> IO [Match]
forall a. IO a -> IO a
unsafeInterleaveIO (IO [Match] -> [Match]) -> IO [Match] -> [Match]
forall a b. (a -> b) -> a -> b
$ TextI -> IO [Match]
go TextI
0
  where
    len :: TextI
len = Int -> TextI
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> TextI) -> (Text -> Int) -> Text -> TextI
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Int
lengthWord (Text -> TextI) -> Text -> TextI
forall a b. (a -> b) -> a -> b
$ Text
haystack
    go :: TextI -> IO [Match]
go !TextI
n | TextI
n TextI -> TextI -> Bool
forall a. Ord a => a -> a -> Bool
>= TextI
len  = [Match] -> IO [Match]
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return []
          | Bool
otherwise = Regex -> Text -> (Regex -> IO [Match]) -> IO [Match]
forall a. Regex -> Text -> (Regex -> IO a) -> IO a
matching Regex
re0 Text
haystack ((Regex -> IO [Match]) -> IO [Match])
-> (Regex -> IO [Match]) -> IO [Match]
forall a b. (a -> b) -> a -> b
$ \Regex
re -> do
      found <- Regex -> TextI -> IO Bool
IO.find Regex
re TextI
n
      if found
        then do
          n' <- IO.end_ re 0
          (Match re n:) `fmap` go n'
        else return []

matching :: Regex -> Text -> (IO.Regex -> IO a) -> IO a
matching :: forall a. Regex -> Text -> (Regex -> IO a) -> IO a
matching (Regex Regex
re0) Text
haystack Regex -> IO a
act = do
  re <- Regex -> IO Regex
IO.clone Regex
re0
  IO.setText re haystack
  act re

-- $group
--
-- Capturing groups are numbered starting from zero.  Group zero is
-- always the entire matching text.  Groups greater than zero contain
-- the text matching each capturing group in a regular expression.

-- | Return the number of capturing groups in this regular
-- expression or match's pattern.
groupCount :: Regular r => r -> Int
groupCount :: forall r. Regular r => r -> Int
groupCount = IO Int -> Int
forall a. IO a -> a
unsafePerformIO (IO Int -> Int) -> (r -> IO Int) -> r -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Regex -> IO Int
IO.groupCount (Regex -> IO Int) -> (r -> Regex) -> r -> IO Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. r -> Regex
forall r. Regular r => r -> Regex
regRe
{-# INLINE groupCount #-}

-- | A combinator for returning a list of all capturing groups on a
-- 'Match'.
unfold :: (Int -> Match -> Maybe Text) -> Match -> [Text]
unfold :: (Int -> Match -> Maybe Text) -> Match -> [Text]
unfold Int -> Match -> Maybe Text
f Match
m = Int -> [Text]
go Int
0
  where go :: Int -> [Text]
go !Int
n = case Int -> Match -> Maybe Text
f Int
n Match
m of
                  Maybe Text
Nothing -> []
                  Just Text
z  -> Text
z Text -> [Text] -> [Text]
forall a. a -> [a] -> [a]
: Int -> [Text]
go (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)

-- | Return the /n/th capturing group in a match, or 'Nothing' if /n/
-- is out of bounds.
group :: Int -> Match -> Maybe Text
group :: Int -> Match -> Maybe Text
group Int
n Match
m = Int -> Match -> (Regex -> IO Text) -> Maybe Text
forall a. Int -> Match -> (Regex -> IO a) -> Maybe a
grouping Int
n Match
m ((Regex -> IO Text) -> Maybe Text)
-> (Regex -> IO Text) -> Maybe Text
forall a b. (a -> b) -> a -> b
$ \Regex
re -> do
  let n' :: Int
n' = Int -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n
  start <- TextI -> TextI
forall a b. (Integral a, Num b) => a -> b
fromIntegral (TextI -> TextI) -> IO TextI -> IO TextI
forall a b. (a -> b) -> IO a -> IO b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` Regex -> Int -> IO TextI
IO.start_ Regex
re Int
n'
  end <- fromIntegral `fmap` IO.end_ re n'
  ut <- IO.getUTextPtr re
  withUTextPtrText ut $ \Ptr Word8
ptr ->
    Ptr Word8 -> TextI -> IO Text
T.fromPtr (Ptr Word8
ptr Ptr Word8 -> Int -> Ptr Word8
forall a. Storable a => Ptr a -> Int -> Ptr a
`advancePtr` TextI -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral TextI
start) (TextI
end TextI -> TextI -> TextI
forall a. Num a => a -> a -> a
- TextI
start)

-- | Return the prefix of the /n/th capturing group in a match (the
-- text from the start of the string to the start of the match), or
-- 'Nothing' if /n/ is out of bounds.
prefix :: Int -> Match -> Maybe Text
prefix :: Int -> Match -> Maybe Text
prefix Int
n Match
m = Int -> Match -> (Regex -> IO Text) -> Maybe Text
forall a. Int -> Match -> (Regex -> IO a) -> Maybe a
grouping Int
n Match
m ((Regex -> IO Text) -> Maybe Text)
-> (Regex -> IO Text) -> Maybe Text
forall a b. (a -> b) -> a -> b
$ \Regex
re -> do
  start <- TextI -> TextI
forall a b. (Integral a, Num b) => a -> b
fromIntegral (TextI -> TextI) -> IO TextI -> IO TextI
forall a b. (a -> b) -> IO a -> IO b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` Regex -> Int -> IO TextI
IO.start_ Regex
re Int
n
  ut <- IO.getUTextPtr re
  withUTextPtrText ut (`T.fromPtr` start)

-- | Return the span of text between the end of the previous match and
-- the beginning of the current match.
span :: Match -> Text
span :: Match -> Text
span (Match Regex
re TextI
p) = IO Text -> Text
forall a. IO a -> a
unsafePerformIO (IO Text -> Text) -> IO Text -> Text
forall a b. (a -> b) -> a -> b
$ do
  start <- Regex -> Int -> IO TextI
IO.start_ Regex
re Int
0
  ut <- IO.getUTextPtr re
  withUTextPtrText ut $ \Ptr Word8
ptr ->
    Ptr Word8 -> TextI -> IO Text
T.fromPtr (Ptr Word8
ptr Ptr Word8 -> Int -> Ptr Word8
forall a. Storable a => Ptr a -> Int -> Ptr a
`advancePtr` TextI -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral TextI
p) (TextI
start TextI -> TextI -> TextI
forall a. Num a => a -> a -> a
- TextI
p)

-- | Return the suffix of the /n/th capturing group in a match (the
-- text from the end of the match to the end of the string), or
-- 'Nothing' if /n/ is out of bounds.
suffix :: Int -> Match -> Maybe Text
suffix :: Int -> Match -> Maybe Text
suffix Int
n Match
m = Int -> Match -> (Regex -> IO Text) -> Maybe Text
forall a. Int -> Match -> (Regex -> IO a) -> Maybe a
grouping Int
n Match
m ((Regex -> IO Text) -> Maybe Text)
-> (Regex -> IO Text) -> Maybe Text
forall a b. (a -> b) -> a -> b
$ \Regex
re -> do
  end <- TextI -> TextI
forall a b. (Integral a, Num b) => a -> b
fromIntegral (TextI -> TextI) -> IO TextI -> IO TextI
forall a b. (a -> b) -> IO a -> IO b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
`fmap` Regex -> Int -> IO TextI
IO.end_ Regex
re Int
n
  ut <- IO.getUTextPtr re
  withUTextPtrText ut $ \Ptr Word8
ptr -> do
    Ptr Word8 -> TextI -> IO Text
T.fromPtr (Ptr Word8
ptr Ptr Word8 -> Int -> Ptr Word8
forall a. Storable a => Ptr a -> Int -> Ptr a
`advancePtr` TextI -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral TextI
end) (UTextPtr -> TextI
utextPtrLength UTextPtr
ut TextI -> TextI -> TextI
forall a. Num a => a -> a -> a
- TextI
end)

grouping :: Int -> Match -> (Internal.Regex -> IO a) -> Maybe a
grouping :: forall a. Int -> Match -> (Regex -> IO a) -> Maybe a
grouping Int
n (Match Regex
m TextI
_) Regex -> IO a
act = IO (Maybe a) -> Maybe a
forall a. IO a -> a
unsafePerformIO (IO (Maybe a) -> Maybe a) -> IO (Maybe a) -> Maybe a
forall a b. (a -> b) -> a -> b
$ do
  count <- Regex -> IO Int
IO.groupCount Regex
m
  let n' = Int -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n
  if n' == 0 || (n' >= 0 && n' <= count)
    then Just `fmap` act m
    else return Nothing