IRI.hs revision c437ed4dfe7d1452abfef361cc82fb614a945745
{-# LANGUAGE CPP, DeriveDataTypeable #-}
--------------------------------------------------------------------------------
-- |
-- Module : Common.IRI
-- Copyright : (c) 2004, Graham Klyne
-- License : BSD-style (see end of this file)
--
-- Maintainer : Eugen Kuksa <eugenk@informatik.uni-bremen.de>
-- Stability : provisional
-- Portability : portable
--
-- This module defines functions for handling IRIs. It is substantially the
-- same as the Network.URI module by Graham Klyne, but is extended to IRI
-- support [2] and even Manchester-Syntax-IRI [3], [4].
--
-- Four methods are provided for parsing different
-- kinds of IRI string (as noted in [1], [2]):
-- 'parseIRI',
-- 'parseIRIReference',
-- 'parseRelativeReference' and
-- 'parseAbsoluteIRI'.
--
-- An addotional method is provided for parsing an abbreviated IRI according to [3], [4]:
-- 'parseIRIManchester'
--
-- Further, four methods are provided for classifying different
-- kinds of IRI string (as noted in [1], [2]):
-- 'isIRI',
-- 'isIRIReference',
-- 'isRelativeReference' and
-- 'isAbsoluteIRI'.
--
-- Additionally, classification of full, abbreviated and simple IRI is provided
-- by
-- 'isIRIManchester'.
--
-- The Manchester-syntax [3], [4] provdies three different kinds of IRI: full,
-- abbreviated and simple. An existing element of type IRI can be classified in
-- one of those kinds with
-- 'iriType'.
--
-- Most of the code has been copied from the Network.URI implementation,
-- but it is extended to IRI and Manchester-syntax.
--
--
-- References
--
-- (1) <http://www.ietf.org/rfc/rfc3986.txt>
--
-- (2) <http://www.ietf.org/rfc/rfc3987.txt>
--
-- (3) <http://www.w3.org/TR/2009/NOTE-owl2-manchester-syntax-20091027/>
--
-- (4) <http://www.w3.org/TR/2008/REC-rdf-sparql-query-20080115/>
--
--------------------------------------------------------------------------------
module Common.IRI
(
-- * The IRI type
IRI(..)
, IRIAuth(..)
, IRIType(..)
, nullIRI
, iriType
-- * Parsing
, parseIRI
, parseIRIReference
, parseRelativeReference
, parseAbsoluteIRI
, parseIRIManchester
-- * Test for strings containing various kinds of IRI
, isIRI
, isIRIReference
, isRelativeReference
, isAbsoluteIRI
, isIPv6address
, isIPv4address
-- * Relative IRIs
, relativeTo
, nonStrictRelativeTo
, relativeFrom
-- * Operations on IRI strings
-- | Support for putting strings into IRI-friendly
-- escaped format and getting them back again.
, iriToString
, iriToStringUnsecure
, isReserved, isUnreserved
, isAllowedInIRI, isUnescapedInIRI
, escapeIRIChar
, escapeIRIString
, unEscapeString
-- * Parser combinators, special additions to export list
, iri
, iriReference
, irelativeRef
, absoluteIRI
, iriManchester
-- * IRI Normalization functions
, normalizeCase
, normalizeEscape
, normalizePathSegments
) where
import Text.ParserCombinators.Parsec
( GenParser(..), ParseError(..)
, parse, (<|>), (<?>), try
, option, many, many1, count, notFollowedBy, lookAhead
, char, satisfy, oneOf, string, letter, digit, hexDigit, eof
, unexpected
)
import Control.Monad (MonadPlus(..))
import Data.Char (ord, chr, isHexDigit, isSpace, toLower, toUpper, digitToInt)
import Data.Maybe (isJust)
import Debug.Trace (trace)
import Numeric (showIntAtBase)
import Data.Typeable (Typeable)
import Data.Data (Data)
------------------------------------------------------------
-- The IRI datatype
------------------------------------------------------------
-- |Represents a general universal resource identifier using
-- its component parts.
--
-- For example, for the (full) IRI
--
-- > foo://anonymous@www.haskell.org:42/ghc?query#frag
--
-- or the abbreviated IRI
--
-- > prefix:abbrevPath
--
-- or the simple IRI
--
-- > abbrevPath
--
-- the components are:
--
data IRI = IRI
{ iriScheme :: String -- ^ @foo:@
, iriAuthority :: Maybe IRIAuth -- ^ @\/\/anonymous\@www.haskell.org:42@
, iriPath :: String -- ^ local part @\/ghc@
, iriQuery :: String -- ^ @?query@
, iriFragment :: String -- ^ @#frag@
, prefixName :: String -- ^ @prefix@
, abbrevPath :: String -- ^ @abbrevPath@
-- ^ prefix name part from "prefixName:path"
} deriving (Eq, Typeable, Data, Ord, Show)
-- |Type for authority value within a IRI
data IRIAuth = IRIAuth
{ iriUserInfo :: String -- ^ @anonymous\@@
, iriRegName :: String -- ^ @www.haskell.org@
, iriPort :: String -- ^ @:42@
} deriving (Eq, Typeable, Data, Ord, Show)
data IRIType = Full | Abbreviated | Simple
deriving (Eq, Show, Typeable, Data, Ord)
-- |Blank IRI
nullIRI :: IRI
nullIRI = IRI
{ iriScheme = ""
, iriAuthority = Nothing
, iriPath = ""
, iriQuery = ""
, iriFragment = ""
, prefixName = ""
, abbrevPath = ""
}
-- |Returns Type of an IRI
iriType :: IRI -> IRIType
iriType i =
if (not.null) $ iriPath i then Full else
if null $ prefixName i then Simple else Abbreviated
-- IRI as instance of Show. Note that for secirity reasons, the default
-- behaviour is to suppress any iuserinfo field (see RFC3986, section 7.5).
-- This can be overridden by using iriToString directly with first
-- argument @id@ (noting that this returns a ShowS value rather than a string).
--
-- [[[Another design would be to embed the iuserinfo mapping function in
-- the IRIAuth value, with the default value suppressing iuserinfo formatting,
-- but providing a function to return a new IRI value with iuserinfo
-- data exposed by show.]]]
--
-- instance Show IRI where
-- showsPrec _ iri = iriToString defaultUserInfoMap iri
defaultUserInfoMap :: String -> String
defaultUserInfoMap uinf = user++newpass
where
(user,pass) = break (==':') uinf
newpass = if null pass || (pass == "@")
|| (pass == ":@")
then pass
else ":...@"
iriToStringUnsecure :: IRI -> String
iriToStringUnsecure i = (iriToString id i) ""
------------------------------------------------------------
-- Parse a IRI
------------------------------------------------------------
-- |Turn a string containing an RFC3987 IRI into a 'IRI'.
-- Returns 'Nothing' if the string is not a valid IRI;
-- (an absolute IRI with optional fragment identifier).
--
parseIRI :: String -> Maybe IRI
parseIRI = parseIRIAny iri
-- |Parse a IRI reference to a 'IRI' value.
-- Returns 'Nothing' if the string is not a valid IRI reference.
-- (an absolute or relative IRI with optional fragment identifier).
--
parseIRIReference :: String -> Maybe IRI
parseIRIReference = parseIRIAny iriReference
-- |Parse a relative IRI to a 'IRI' value.
-- Returns 'Nothing' if the string is not a valid relative IRI.
-- (a relative IRI with optional fragment identifier).
--
parseRelativeReference :: String -> Maybe IRI
parseRelativeReference = parseIRIAny irelativeRef
-- |Parse an absolute IRI to a 'IRI' value.
-- Returns 'Nothing' if the string is not a valid absolute IRI.
-- (an absolute IRI without a fragment identifier).
--
parseAbsoluteIRI :: String -> Maybe IRI
parseAbsoluteIRI = parseIRIAny absoluteIRI
-- |Turn a string containing an IRI (by Manchester-syntax) into a 'IRI'.
-- Returns 'Nothing' if the string is not a valid IRI;
-- (an absolute IRI enclosed in '<' and '>' with optional fragment identifier,
-- an abbreviated IRI or a simple IRI).
--
parseIRIManchester :: String -> Maybe IRI
parseIRIManchester = parseIRIAny iriManchester
-- |Test if string contains a valid IRI
-- (an absolute IRI with optional fragment identifier).
--
isIRI :: String -> Bool
isIRI = isValidParse iri
-- |Test if string contains a valid IRI reference
-- (an absolute or relative IRI with optional fragment identifier).
--
isIRIReference :: String -> Bool
isIRIReference = isValidParse iriReference
-- |Test if string contains a valid relative IRI
-- (a relative IRI with optional fragment identifier).
--
isRelativeReference :: String -> Bool
isRelativeReference = isValidParse irelativeRef
-- |Test if string contains a valid absolute IRI
-- (an absolute IRI without a fragment identifier).
--
isAbsoluteIRI :: String -> Bool
isAbsoluteIRI = isValidParse absoluteIRI
-- |Test if string contains a valid IRI by Manchester-syntax
-- (an absolute IRI enclosed in '<' and '>' with optional fragment identifier,
-- an abbreviated IRI or a simple IRI).
--
isIRIManchester :: String -> Bool
isIRIManchester = isValidParse iriManchester
-- |Test if string contains a valid IPv6 address
--
isIPv6address :: String -> Bool
isIPv6address = isValidParse ipv6address
-- |Test if string contains a valid IPv4 address
--
isIPv4address :: String -> Bool
isIPv4address = isValidParse ipv4address
-- |Test function: parse and reconstruct a IRI reference
--
testIRIReference :: String -> String
testIRIReference iristr = show (parseAll iriReference "" iristr)
-- Helper function for turning a string into a IRI
--
parseIRIAny :: IRIParserDirect IRI -> String -> Maybe IRI
parseIRIAny parser iristr = case parseAll parser "" iristr of
Left _ -> Nothing
Right u -> Just u
-- Helper function to test a string match to a parser
--
isValidParse :: IRIParserDirect a -> String -> Bool
isValidParse parser iristr = case parseAll parser "" iristr of
-- Left e -> error (show e)
Left _ -> False
Right u -> True
parseAll :: IRIParserDirect a -> String -> String -> Either ParseError a
parseAll parser filename iristr = parse newparser filename iristr
where
newparser =
do { res <- parser
; eof
; return res
}
------------------------------------------------------------
-- IRI parser body based on Parsec elements and combinators
------------------------------------------------------------
-- Parser parser type.
-- Currently
type IRIParserDirect a = GenParser Char () a
type IRIParser st a = GenParser Char st a
-- RFC3986, section 2.1
--
-- Parse and return a 'pct-encoded' sequence
--
escaped :: IRIParser st String
escaped =
do { char '%'
; h1 <- hexDigitChar
; h2 <- hexDigitChar
; return $ ['%',h1,h2]
}
-- RFC3986, section 2.2
--
-- |Returns 'True' if the character is a \"reserved\" character in a
-- IRI. To include a literal instance of one of these characters in a
-- component of a IRI, it must be escaped.
--
isReserved :: Char -> Bool
isReserved c = isGenDelims c || isSubDelims c
isGenDelims c = c `elem` ":/?#[]@"
isSubDelims c = c `elem` "!$&'()*+,;="
genDelims :: IRIParser st String
genDelims = do { c <- satisfy isGenDelims ; return [c] }
subDelims :: IRIParser st String
subDelims = do { c <- satisfy isSubDelims ; return [c] }
-- RFC3986, section 2.3
--
-- |Returns 'True' if the character is an \"unreserved\" character in
-- a IRI. These characters do not need to be escaped in a IRI. The
-- only characters allowed in a IRI are either \"reserved\",
-- \"unreserved\", or an escape sequence (@%@ followed by two hex digits).
--
isUnreserved :: Char -> Bool
isUnreserved c = isAlphaNumChar c || (c `elem` "-_.~") || (isUcsChar c)
iunreservedChar :: IRIParser st String
iunreservedChar = do { c <- satisfy isUnreserved ; return [c] }
{- BEGIN CURIE -}
-- http://www.w3.org/TR/rdfa-core/#s_curies
curie :: IRIParser st IRI
curie = do
c <- string ":"
i <- reference
return $ i { prefixName = c }
<|> do
p <- try (do
n <- ncname
c <- string ":"
return $ n++c
)
i <- reference
return $ i { prefixName = p }
<|> reference
reference :: IRIParser st IRI
reference = do
(_, up) <- ihierPartNoAuth
uq <- option "" ( do { char '?' ; uiquery } )
uf <- option "" ( do { char '#' ; uifragment } )
return $ IRI
{ iriScheme = ""
, iriAuthority = Nothing
, iriPath = ""
, iriQuery = uq
, iriFragment = uf
, prefixName = ""
, abbrevPath = up
}
-- http://www.w3.org/TR/2009/REC-xml-names-20091208/#NT-NCName
ncname :: GenParser Char st String
ncname = do
c <- nameStartChar
s <- many nameChar
return $ c:s
nameStartChar :: GenParser Char st Char
nameStartChar = satisfy nameStartCharP
nameChar :: GenParser Char st Char
nameChar = satisfy nameCharP
-- NOTE: Usually ':' is allowed. Here, only ncname uses nameStartChar, however.
-- Thus we disallow ':'
nameStartCharP :: Char -> Bool
nameStartCharP c =
let n = ord c in
(c == '_') || -- usually: (c `elem` ":_") ||
isAlphaChar c ||
(0x00C0 <= n && n <= 0x00D6) ||
(0x00D8 <= n && n <= 0x00F6) ||
(0x00F8 <= n && n <= 0x02FF) ||
(0x0370 <= n && n <= 0x037D) ||
(0x037F <= n && n <= 0x1FFF) ||
(0x200C <= n && n <= 0x200D) ||
(0x2070 <= n && n <= 0x218F) ||
(0x2C00 <= n && n <= 0x2FEF) ||
(0x3001 <= n && n <= 0xD7FF) ||
(0xF900 <= n && n <= 0xFDCF) ||
(0xFDF0 <= n && n <= 0xFFFD) ||
(0x10000 <= n && n <= 0xEFFFF)
nameCharP :: Char -> Bool
nameCharP c =
let n = ord c in
nameStartCharP c ||
isDigitChar c ||
c `elem` "-." ||
n == 0xB7 ||
(0x0300 <= n && n <= 0x036F) ||
(0x203F <= n && n <= 0x2040)
{- END CURIE -}
{- BEGIN SPARQL -}
-- http://www.w3.org/TR/2008/REC-rdf-sparql-query-20080115/
-- Section 4.1
pn_chars_baseP :: Char -> Bool
pn_chars_baseP c =
let n = ord c in
isAlphaChar c ||
(0x00C0 <= n && n <= 0x00D6) ||
(0x00D8 <= n && n <= 0x00F6) ||
(0x00F8 <= n && n <= 0x02FF) ||
(0x0370 <= n && n <= 0x037D) ||
(0x037F <= n && n <= 0x1FFF) ||
(0x200C <= n && n <= 0x200D) ||
(0x2070 <= n && n <= 0x218F) ||
(0x2C00 <= n && n <= 0x2FEF) ||
(0x00D8 <= n && n <= 0x00F6) ||
(0x3001 <= n && n <= 0xD7FF) ||
(0xF900 <= n && n <= 0xFDCF) ||
(0xFDF0 <= n && n <= 0xFFFD) ||
(0x10000 <= n && n <= 0xEFFFF)
pn_chars_base :: GenParser Char st Char
pn_chars_base = satisfy pn_chars_baseP
pn_chars_u :: GenParser Char st Char
pn_chars_u = satisfy pn_chars_uP
pn_chars :: GenParser Char st Char
pn_chars = satisfy pn_charsP
pn_chars_uP :: Char -> Bool
pn_chars_uP c = (pn_chars_baseP c) || (c == '_')
pn_charsP :: Char -> Bool
pn_charsP c =
let n = ord c in
c == '-' ||
pn_chars_uP c ||
isDigitChar c ||
(n == 0x00B7) ||
(0x0300 <= n && n <= 0x036F) ||
(0x203F <= n && n <= 0x2040)
-- http://www.w3.org/TR/2009/NOTE-owl2-manchester-syntax-20091027/
-- section 2.1
-- fullIRI := an IRI as defined in [RFC 3987], enclosed in a pair of < (U+3C) and > (U+3E) characters
-- prefixName := a finite sequence of characters matching the PNAME_NS production of [SPARQL] and not matching any of the keyword terminals of the syntax
-- abbreviatedIRI := a finite sequence of characters matching the PNAME_LN production of [SPARQL]
-- simpleIRI := a finite sequence of characters matching the PN_LOCAL production of [SPARQL] and not matching any of the keyword terminals of the syntax
-- IRI := fullIRI | abbreviatedIRI | simpleIRI
iriManchester :: IRIParser st IRI
iriManchester = do
char '<'
i <- iri <|> irelativeRef
char '>'
return i
<|> do
(PName_Ln prefix loc) <- try pname_ln
return $ IRI
{ iriScheme = ""
, iriAuthority = Nothing
, iriPath = ""
, iriQuery = ""
, iriFragment = ""
, prefixName = prefix
, abbrevPath = loc
}
<|> do
loc <- pn_local
return $ IRI
{ iriScheme = ""
, iriAuthority = Nothing
, iriPath = ""
, iriQuery = ""
, iriFragment = ""
, prefixName = ""
, abbrevPath = loc
}
data PNAME_LN = PName_Ln PNAME_NS PN_LOCAL deriving (Show, Eq, Ord)
type PNAME_NS = String
type PN_PREFIX = String
type PN_LOCAL = String
pname_ln :: GenParser Char st PNAME_LN
pname_ln = do
ns <- pname_ns
loc <- pn_local
return $ PName_Ln ns loc
pname_ns :: GenParser Char st PNAME_NS
pname_ns = do
char ':'
return ":"
<|> do
prefix <- pn_prefix
char ':'
return $ prefix ++ ":"
pn_prefix :: GenParser Char st PN_PREFIX
pn_prefix = do
c1 <- pn_chars_base
t <- (
do
s1 <- many (pn_chars <|> char '.')
if null s1 then return Nothing else case last s1 of
'.' -> fail "Last character in prefix must not be '.'"
_ -> return $ Just s1
<|> return Nothing
)
case t of
Just str -> return $ c1:str
Nothing -> return [c1]
pn_local :: GenParser Char st PN_LOCAL
pn_local = do
c1 <- (pn_chars_u <|> digit)
t <- (
do
s1 <- many (pn_chars <|> char '.')
if null s1 then return Nothing else case last s1 of
'.' -> fail "Last character in prefix must not be '.'"
_ -> return $ Just s1
<|> return Nothing
)
case t of
Just str -> return $ c1:str
Nothing -> return [c1]
{- END SPARQL -}
-- RFC3987, section 2.2
--
-- IRI = scheme ":" ihier-part [ "?" iquery ] [ "#" ifragment ]
--
-- ihier-part = "//" iauthority ipath-abempty
-- / ipath-absolute
-- / ipath-rootless
-- / ipath-empty
iri :: IRIParser st IRI
iri =
do { us <- try uscheme
-- ; ua <- option Nothing ( do { try (string "//") ; uiauthority } )
-- ; up <- upath
; (ua,up) <- ihierPart
; uq <- option "" ( do { char '?' ; uiquery } )
; uf <- option "" ( do { char '#' ; uifragment } )
; return $ IRI
{ iriScheme = us
, iriAuthority = ua
, iriPath = up
, iriQuery = uq
, iriFragment = uf
, prefixName = ""
, abbrevPath = ""
}
}
ihierPart :: IRIParser st ((Maybe IRIAuth),String)
ihierPart =
do { try (string "//")
; ua <- uiauthority
; up <- ipathAbEmpty
; return (ua,up)
}
<|> ihierPartNoAuth
ihierPartNoAuth :: IRIParser st ((Maybe IRIAuth),String)
ihierPartNoAuth =
do { up <- ipathAbs
; return (Nothing, up)
}
<|> do { up <- ipathRootLess
; return (Nothing, up)
}
<|> do { return (Nothing, "")
}
-- RFC3986, section 3.1
uscheme :: IRIParser st String
uscheme =
do { s <- oneThenMany alphaChar (satisfy isSchemeChar)
; char ':'
; return $ s++":"
}
-- RFC3987, section 2.2
uiauthority :: IRIParser st (Maybe IRIAuth)
uiauthority =
do { uu <- option "" (try iuserinfo)
; uh <- ihost
; up <- option "" port
; return $ Just $ IRIAuth
{ iriUserInfo = uu
, iriRegName = uh
, iriPort = up
}
}
-- RFC3987, section 2.2
iuserinfo :: IRIParser st String
iuserinfo =
do { uu <- many (uchar ";:&=+$,")
; char '@'
; return (concat uu ++"@")
}
-- RFC3987, section 2.2
ihost :: IRIParser st String
ihost = ipLiteral <|> try ipv4address <|> iregName
ipLiteral :: IRIParser st String
ipLiteral =
do { char '['
; ua <- ( ipv6address <|> ipvFuture )
; char ']'
; return $ "[" ++ ua ++ "]"
}
<?> "IP address literal"
ipvFuture :: IRIParser st String
ipvFuture =
do { char 'v'
; h <- hexDigitChar
; char '.'
; a <- many1 (satisfy isIpvFutureChar)
; return $ 'c':h:'.':a
}
isIpvFutureChar c = isUnreserved c || isSubDelims c || (c==';')
ipv6address :: IRIParser st String
ipv6address =
try ( do
{ a2 <- count 6 h4c
; a3 <- ls32
; return $ concat a2 ++ a3
} )
<|> try ( do
{ string "::"
; a2 <- count 5 h4c
; a3 <- ls32
; return $ "::" ++ concat a2 ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 0
; string "::"
; a2 <- count 4 h4c
; a3 <- ls32
; return $ a1 ++ "::" ++ concat a2 ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 1
; string "::"
; a2 <- count 3 h4c
; a3 <- ls32
; return $ a1 ++ "::" ++ concat a2 ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 2
; string "::"
; a2 <- count 2 h4c
; a3 <- ls32
; return $ a1 ++ "::" ++ concat a2 ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 3
; string "::"
; a2 <- h4c
; a3 <- ls32
; return $ a1 ++ "::" ++ a2 ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 4
; string "::"
; a3 <- ls32
; return $ a1 ++ "::" ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 5
; string "::"
; a3 <- h4
; return $ a1 ++ "::" ++ a3
} )
<|> try ( do
{ a1 <- opt_n_h4c_h4 6
; string "::"
; return $ a1 ++ "::"
} )
<?> "IPv6 address"
opt_n_h4c_h4 :: Int -> IRIParser st String
opt_n_h4c_h4 n = option "" $
do { a1 <- countMinMax 0 n h4c
; a2 <- h4
; return $ concat a1 ++ a2
}
ls32 :: IRIParser st String
ls32 = try ( do
{ a1 <- h4c
; a2 <- h4
; return (a1++a2)
} )
<|> ipv4address
h4c :: IRIParser st String
h4c = try $
do { a1 <- h4
; char ':'
; notFollowedBy (char ':')
; return $ a1 ++ ":"
}
h4 :: IRIParser st String
h4 = countMinMax 1 4 hexDigitChar
ipv4address :: IRIParser st String
ipv4address =
do { a1 <- decOctet ; char '.'
; a2 <- decOctet ; char '.'
; a3 <- decOctet ; char '.'
; a4 <- decOctet
; return $ a1++"."++a2++"."++a3++"."++a4
}
decOctet :: IRIParser st String
decOctet =
do { a1 <- countMinMax 1 3 digitChar
; if read a1 > 255 then
fail "Decimal octet value too large"
else
return a1
}
iregName :: IRIParser st String
iregName =
do { ss <- countMinMax 0 255 ( iunreservedChar <|> escaped <|> subDelims )
; return $ concat ss
}
<?> "Registered name"
-- RFC3986, section 3.2.3
port :: IRIParser st String
port =
do { char ':'
; p <- many digitChar
; return (':':p)
}
--
-- RFC3987, section 2.2
--
-- ipath = ipath-abempty ; begins with "/" or is empty
-- / ipath-absolute ; begins with "/" but not "//"
-- / ipath-noscheme ; begins with a non-colon isegment
-- / ipath-rootless ; begins with a isegment
-- / ipath-empty ; zero characters
--
-- ipath-abempty = *( "/" iisegment )
-- ipath-absolute = "/" [ iisegment-nz *( "/" iisegment ) ]
-- ipath-noscheme = iisegment-nz-nc *( "/" iisegment )
-- ipath-rootless = iisegment-nz *( "/" iisegment )
-- ipath-empty = 0<iipchar>
--
-- iisegment = *iipchar
-- iisegment-nz = 1*iipchar
-- iisegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims
-- / "@" )
-- ; non-zero-length isegment without any colon ":"
--
-- iipchar = iunreserved / pct-encoded / sub-delims / ":"
-- / "@"
ipathAbEmpty :: IRIParser st String
ipathAbEmpty =
do { ss <- many slashIsegment
; return $ concat ss
}
ipathAbs :: IRIParser st String
ipathAbs =
do { char '/'
; ss <- option "" ipathRootLess
; return $ '/':ss
}
ipathRootLess :: IRIParser st String
ipathRootLess =
do { s1 <- isegmentNz
; ss <- many slashIsegment
; return $ concat (s1:ss)
}
ipathNoScheme :: IRIParser st String
ipathNoScheme =
do { s1 <- isegmentNzc
; ss <- many slashIsegment
; return $ concat (s1:ss)
}
slashIsegment :: IRIParser st String
slashIsegment =
do { char '/'
; s <- isegment
; return ('/':s)
}
isegment :: IRIParser st String
isegment =
do { ps <- many ipchar
; return $ concat ps
}
isegmentNz :: IRIParser st String
isegmentNz =
do { ps <- many1 ipchar
; return $ concat ps
}
isegmentNzc :: IRIParser st String
isegmentNzc =
do { ps <- many1 (uchar "@")
; return $ concat ps
}
ipchar :: IRIParser st String
ipchar = uchar ":@"
-- helper function for ipchar and friends
uchar :: String -> IRIParser st String
uchar extras =
iunreservedChar
<|> escaped
<|> subDelims
<|> do { c <- oneOf extras ; return [c] }
-- RFC3987, section 2.2
uiquery :: IRIParser st String
uiquery =
do { ss <- many iqueryPart
; return $ '?':concat ss
}
iqueryPart :: IRIParser st String
iqueryPart = (many1 iprivate) <|> (uchar $ ":@" ++ "/?")
-- RFC3987, section 2.2
uifragment :: IRIParser st String
uifragment =
do { ss <- many $ uchar (":@"++"/?")
; return $ '#':concat ss
}
-- Reference, Relative and Absolute IRI forms
--
-- RFC3987, section 2.2
iriReference :: IRIParser st IRI
iriReference = iri <|> irelativeRef
-- RFC3987, section 2.2
--
-- irelative-ref = irelative-part [ "?" iquery ] [ "#" ifragment ]
--
-- irelative-part = "//" iauthority ipath-abempty
-- / ipath-absolute
irelativeRef :: IRIParser st IRI
irelativeRef =
do { notMatching uscheme
; (ua,up) <- irelativePart
; uq <- option "" ( do { char '?' ; uiquery } )
; uf <- option "" ( do { char '#' ; uifragment } )
; return $ IRI
{ iriScheme = ""
, iriAuthority = ua
, iriPath = up
, iriQuery = uq
, iriFragment = uf
, prefixName = ""
, abbrevPath = ""
}
}
irelativePart :: IRIParser st ((Maybe IRIAuth),String)
irelativePart =
do { try (string "//")
; ua <- uiauthority
; up <- ipathAbEmpty
; return (ua,up)
}
<|> do { up <- ipathAbs
; return (Nothing,up)
}
<|> do { up <- ipathNoScheme
; return (Nothing,up)
}
<|> do { return (Nothing,"")
}
-- RFC3987, section 2.2
absoluteIRI :: IRIParser st IRI
absoluteIRI =
do { us <- uscheme
-- ; ua <- option Nothing ( do { try (string "//") ; uiauthority } )
-- ; up <- upath
; (ua,up) <- ihierPart
; uq <- option "" ( do { char '?' ; uiquery } )
; return $ IRI
{ iriScheme = us
, iriAuthority = ua
, iriPath = up
, iriQuery = uq
, iriFragment = ""
, prefixName = ""
, abbrevPath = ""
}
}
-- Imports from RFC 2234
-- NOTE: can't use isAlphaNum etc. because these deal with ISO 8859
-- (and possibly Unicode!) chars.
-- [[[Above was a comment originally in GHC Network/IRI.hs:
-- when IRIs are introduced then most codepoints above 128(?) should
-- be treated as unreserved, and higher codepoints for letters should
-- certainly be allowed.
-- ]]]
isAlphaChar c = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
isDigitChar c = (c >= '0' && c <= '9')
isAlphaNumChar c = isAlphaChar c || isDigitChar c
isUcsChar c =
let n = ord c
in (0xA0 <= n && n <= 0xD7FF) ||
(0x20000 <= n && n <= 0x2FFFD) ||
(0x30000 <= n && n <= 0x3FFFD) ||
(0x40000 <= n && n <= 0x4FFFD) ||
(0x50000 <= n && n <= 0x5FFFD) ||
(0x60000 <= n && n <= 0x6FFFD) ||
(0x70000 <= n && n <= 0x7FFFD) ||
(0x80000 <= n && n <= 0x8FFFD) ||
(0x90000 <= n && n <= 0x9FFFD) ||
(0xA0000 <= n && n <= 0xAFFFD) ||
(0xB0000 <= n && n <= 0xBFFFD) ||
(0xC0000 <= n && n <= 0xCFFFD) ||
(0xD0000 <= n && n <= 0xDFFFD) ||
(0xE0000 <= n && n <= 0xEFFFD)
isIprivate c =
let n = ord c
in (0xE000 <= n && n <= 0xF8FF) ||
(0xF000 <= n && n <= 0xFFFD) ||
(0x100000 <= n && n <= 0x10FFFD)
isHexDigitChar c = isHexDigit c
isSchemeChar c = (isAlphaNumChar c) || (c `elem` "+-.")
alphaChar :: IRIParser st Char
alphaChar = satisfy isAlphaChar -- or: Parsec.letter ?
digitChar :: IRIParser st Char
digitChar = satisfy isDigitChar -- or: Parsec.digit ?
alphaNumChar :: IRIParser st Char
alphaNumChar = satisfy isAlphaNumChar
hexDigitChar :: IRIParser st Char
hexDigitChar = satisfy isHexDigitChar -- or: Parsec.hexDigit ?
ucsChar :: IRIParser st Char
ucsChar = satisfy isUcsChar
iprivate :: IRIParser st Char
iprivate = satisfy isIprivate
-- Additional parser combinators for common patterns
oneThenMany :: GenParser t s a -> GenParser t s a -> GenParser t s [a]
oneThenMany p1 pr =
do { a1 <- p1
; ar <- many pr
; return (a1:ar)
}
countMinMax :: Int -> Int -> GenParser t s a -> GenParser t s [a]
countMinMax m n p | m > 0 =
do { a1 <- p
; ar <- countMinMax (m-1) (n-1) p
; return (a1:ar)
}
countMinMax _ n _ | n <= 0 = return []
countMinMax _ n p = option [] $
do { a1 <- p
; ar <- countMinMax 0 (n-1) p
; return (a1:ar)
}
notMatching :: Show a => GenParser tok st a -> GenParser tok st ()
notMatching p = do { a <- try p ; unexpected (show a) } <|> return ()
------------------------------------------------------------
-- Reconstruct a IRI string
------------------------------------------------------------
--
-- |Turn a 'IRI' into a string.
--
-- Uses a supplied function to map the iuserinfo part of the IRI.
--
-- The Show instance for IRI uses a mapping that hides any password
-- that may be present in the IRI. Use this function with argument @id@
-- to preserve the password in the formatted output.
--
iriToString :: (String->String) -> IRI -> ShowS
iriToString iuserinfomap i@(IRI { iriScheme=scheme
, iriAuthority=authority
, iriPath=path
, iriQuery=query
, iriFragment=fragment
, prefixName=pname
, abbrevPath=aPath
}) = case iriType i of
Simple -> (aPath++)
Abbreviated -> (pname++) . (aPath++)
Full -> (scheme++) . (iriAuthToString iuserinfomap authority)
. (path++) . (query++) . (fragment++)
iriAuthToString :: (String->String) -> (Maybe IRIAuth) -> ShowS
iriAuthToString _ Nothing = id -- shows ""
iriAuthToString iuserinfomap
(Just IRIAuth { iriUserInfo = uinfo
, iriRegName = regname
, iriPort = port
} ) =
("//"++) . (if null uinfo then id else ((iuserinfomap uinfo)++))
. (regname++)
. (port++)
------------------------------------------------------------
-- Character classes
------------------------------------------------------------
-- | Returns 'True' if the character is allowed in a IRI.
--
isAllowedInIRI :: Char -> Bool
isAllowedInIRI c = isReserved c || isUnreserved c || c == '%' -- escape char
-- | Returns 'True' if the character is allowed unescaped in a IRI.
--
isUnescapedInIRI :: Char -> Bool
isUnescapedInIRI c = isReserved c || isUnreserved c
------------------------------------------------------------
-- Escape sequence handling
------------------------------------------------------------
-- |Escape character if supplied predicate is not satisfied,
-- otherwise return character as singleton string.
--
escapeIRIChar :: (Char->Bool) -> Char -> String
escapeIRIChar p c
| p c = [c]
| otherwise = '%' : myShowHex (ord c) ""
where
myShowHex :: Int -> ShowS
myShowHex n r = case showIntAtBase 16 (toChrHex) n r of
[] -> "00"
[c] -> ['0',c]
cs -> cs
toChrHex d
| d < 10 = chr (ord '0' + fromIntegral d)
| otherwise = chr (ord 'A' + fromIntegral (d - 10))
-- |Can be used to make a string valid for use in a IRI.
--
escapeIRIString
:: (Char->Bool) -- ^ a predicate which returns 'False'
-- if the character should be escaped
-> String -- ^ the string to process
-> String -- ^ the resulting IRI string
escapeIRIString p s = concatMap (escapeIRIChar p) s
-- |Turns all instances of escaped characters in the string back
-- into literal characters.
--
unEscapeString :: String -> String
unEscapeString [] = ""
unEscapeString ('%':x1:x2:s) | isHexDigit x1 && isHexDigit x2 =
chr (digitToInt x1 * 16 + digitToInt x2) : unEscapeString s
unEscapeString (c:s) = c : unEscapeString s
------------------------------------------------------------
-- Resolving a relative IRI relative to a base IRI
------------------------------------------------------------
-- |Returns a new 'IRI' which represents the value of the
-- first 'IRI' interpreted as relative to the second 'IRI'.
-- For example:
--
-- > "foo" `relativeTo` "http://bar.org/" = "http://bar.org/foo"
-- > "http:foo" `nonStrictRelativeTo` "http://bar.org/" = "http://bar.org/foo"
--
-- Algorithm from RFC3986 [3], section 5.2.2
--
nonStrictRelativeTo :: IRI -> IRI -> Maybe IRI
nonStrictRelativeTo ref base = relativeTo ref' base
where
ref' = if iriScheme ref == iriScheme base
then ref { iriScheme="" }
else ref
isDefined :: ( MonadPlus m, Eq (m a) ) => m a -> Bool
isDefined a = a /= mzero
-- |Compute an absolute 'IRI' for a supplied IRI
-- relative to a given base.
relativeTo :: IRI -> IRI -> Maybe IRI
relativeTo ref base
| isDefined ( iriScheme ref ) =
just_isegments ref
| isDefined ( iriAuthority ref ) =
just_isegments ref { iriScheme = iriScheme base }
| isDefined ( iriPath ref ) =
if (head (iriPath ref) == '/') then
just_isegments ref
{ iriScheme = iriScheme base
, iriAuthority = iriAuthority base
}
else
just_isegments ref
{ iriScheme = iriScheme base
, iriAuthority = iriAuthority base
, iriPath = mergePaths base ref
}
| isDefined ( iriQuery ref ) =
just_isegments ref
{ iriScheme = iriScheme base
, iriAuthority = iriAuthority base
, iriPath = iriPath base
}
| otherwise =
just_isegments ref
{ iriScheme = iriScheme base
, iriAuthority = iriAuthority base
, iriPath = iriPath base
, iriQuery = iriQuery base
}
where
just_isegments u =
Just $ u { iriPath = removeDotSegments (iriPath u) }
mergePaths b r
| isDefined (iriAuthority b) && null pb = '/':pr
| otherwise = dropLast pb ++ pr
where
pb = iriPath b
pr = iriPath r
dropLast = fst . splitLast -- reverse . dropWhile (/='/') . reverse
-- Remove dot isegments, but protect leading '/' character
removeDotSegments :: String -> String
removeDotSegments ('/':ps) = '/':elimDots ps []
removeDotSegments ps = elimDots ps []
-- Second arg accumulates isegments processed so far in reverse order
elimDots :: String -> [String] -> String
elimDots [] [] = ""
elimDots [] rs = concat (reverse rs)
elimDots ( '.':'/':ps) rs = elimDots ps rs
elimDots ( '.':[] ) rs = elimDots [] rs
elimDots ( '.':'.':'/':ps) rs = elimDots ps (dropHead rs)
elimDots ( '.':'.':[] ) rs = elimDots [] (dropHead rs)
elimDots ps rs = elimDots ps1 (r:rs)
where
(r,ps1) = nextSegment ps
-- Return tail of non-null list, otherwise return null list
dropHead :: [a] -> [a]
dropHead [] = []
dropHead (r:rs) = rs
-- Returns the next isegment and the rest of the path from a path string.
-- Each isegment ends with the next '/' or the end of string.
--
nextSegment :: String -> (String,String)
nextSegment ps =
case break (=='/') ps of
(r,'/':ps1) -> (r++"/",ps1)
(r,_) -> (r,[])
-- Split last (name) isegment from path, returning (path,name)
splitLast :: String -> (String,String)
splitLast path = (reverse revpath,reverse revname)
where
(revname,revpath) = break (=='/') $ reverse path
------------------------------------------------------------
-- Finding a IRI relative to a base IRI
------------------------------------------------------------
-- |Returns a new 'IRI' which represents the relative location of
-- the first 'IRI' with respect to the second 'IRI'. Thus, the
-- values supplied are expected to be absolute IRIs, and the result
-- returned may be a relative IRI.
--
-- Example:
--
-- > "http://example.com/Root/sub1/name2#frag"
-- > `relativeFrom` "http://example.com/Root/sub2/name2#frag"
-- > == "../sub1/name2#frag"
--
-- There is no single correct implementation of this function,
-- but any acceptable implementation must satisfy the following:
--
-- > (uabs `relativeFrom` ubase) `relativeTo` ubase == uabs
--
-- For any valid absolute IRI.
-- (cf. <http://lists.w3.org/Archives/Public/iri/2003Jan/0008.html>
-- <http://lists.w3.org/Archives/Public/iri/2003Jan/0005.html>)
--
relativeFrom :: IRI -> IRI -> IRI
relativeFrom uabs base
| diff iriScheme uabs base = uabs
| diff iriAuthority uabs base = uabs { iriScheme = "" }
| diff iriPath uabs base = uabs
{ iriScheme = ""
, iriAuthority = Nothing
, iriPath = relPathFrom (removeBodyDotSegments $ iriPath uabs)
(removeBodyDotSegments $ iriPath base)
}
| diff iriQuery uabs base = uabs
{ iriScheme = ""
, iriAuthority = Nothing
, iriPath = ""
}
| otherwise = uabs -- Always carry fragment from uabs
{ iriScheme = ""
, iriAuthority = Nothing
, iriPath = ""
, iriQuery = ""
}
where
diff :: Eq b => (a -> b) -> a -> a -> Bool
diff sel u1 u2 = sel u1 /= sel u2
-- Remove dot isegments except the final isegment
removeBodyDotSegments p = removeDotSegments p1 ++ p2
where
(p1,p2) = splitLast p
relPathFrom :: String -> String -> String
relPathFrom [] base = "/"
relPathFrom pabs [] = pabs
relPathFrom pabs base = -- Construct a relative path isegments
if sa1 == sb1 -- if the paths share a leading isegment
then if (sa1 == "/") -- other than a leading '/'
then if (sa2 == sb2)
then relPathFrom1 ra2 rb2
else pabs
else relPathFrom1 ra1 rb1
else pabs
where
(sa1,ra1) = nextSegment pabs
(sb1,rb1) = nextSegment base
(sa2,ra2) = nextSegment ra1
(sb2,rb2) = nextSegment rb1
-- relPathFrom1 strips off trailing names from the supplied paths,
-- and calls difPathFrom to find the relative path from base to
-- target
relPathFrom1 :: String -> String -> String
relPathFrom1 pabs base = relName
where
(sa,na) = splitLast pabs
(sb,nb) = splitLast base
rp = relSegsFrom sa sb
relName = if null rp then
if (na == nb) then ""
else if protect na then "./"++na
else na
else
rp++na
-- Precede name with some path if it is null or contains a ':'
protect na = null na || ':' `elem` na
-- relSegsFrom discards any common leading isegments from both paths,
-- then invokes difSegsFrom to calculate a relative path from the end
-- of the base path to the end of the target path.
-- The final name is handled separately, so this deals only with
-- "directory" segtments.
--
relSegsFrom :: String -> String -> String
relSegsFrom [] [] = "" -- paths are identical
relSegsFrom sabs base =
if sa1 == sb1
then relSegsFrom ra1 rb1
else difSegsFrom sabs base
where
(sa1,ra1) = nextSegment sabs
(sb1,rb1) = nextSegment base
-- difSegsFrom calculates a path difference from base to target,
-- not including the final name at the end of the path
-- (i.e. results always ends with '/')
--
-- This function operates under the invariant that the supplied
-- value of sabs is the desired path relative to the beginning of
-- base. Thus, when base is empty, the desired path has been found.
--
difSegsFrom :: String -> String -> String
difSegsFrom sabs "" = sabs
difSegsFrom sabs base = difSegsFrom ("../"++sabs) (snd $ nextSegment base)
------------------------------------------------------------
-- Other normalization functions
------------------------------------------------------------
-- |Case normalization; cf. RFC3986 section 6.2.2.1
-- NOTE: authority case normalization is not performed
--
normalizeCase :: String -> String
normalizeCase iristr = ncScheme iristr
where
ncScheme (':':cs) = ':':ncEscape cs
ncScheme (c:cs) | isSchemeChar c = toLower c:ncScheme cs
ncScheme _ = ncEscape iristr -- no scheme present
ncEscape ('%':h1:h2:cs) = '%':toUpper h1:toUpper h2:ncEscape cs
ncEscape (c:cs) = c:ncEscape cs
ncEscape [] = []
-- |Encoding normalization; cf. RFC3986 section 6.2.2.2
--
normalizeEscape :: String -> String
normalizeEscape ('%':h1:h2:cs)
| isHexDigit h1 && isHexDigit h2 && isUnreserved escval =
escval:normalizeEscape cs
where
escval = chr (digitToInt h1*16+digitToInt h2)
normalizeEscape (c:cs) = c:normalizeEscape cs
normalizeEscape [] = []
-- |Path isegment normalization; cf. RFC3986 section 6.2.2.4
--
normalizePathSegments :: String -> String
normalizePathSegments iristr = normstr jiri
where
jiri = parseIRI iristr
normstr Nothing = iristr
normstr (Just u) = show (normiri u)
normiri u = u { iriPath = removeDotSegments (iriPath u) }
--------------------------------------------------------------------------------
--
-- Copyright (c) 2004, G. KLYNE. All rights reserved.
-- Distributed as free software under the following license.
--
-- Redistribution and use in source and binary forms, with or without
-- modification, are permitted provided that the following conditions
-- are met:
--
-- - Redistributions of source code must retain the above copyright notice,
-- this list of conditions and the following disclaimer.
--
-- - Redistributions in binary form must reproduce the above copyright
-- notice, this list of conditions and the following disclaimer in the
-- documentation and/or other materials provided with the distribution.
--
-- - Neither name of the copyright holders nor the names of its
-- contributors may be used to endorse or promote products derived from
-- this software without specific prior written permission.
--
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
-- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-- HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
-- OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
-- TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-- USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--
--------------------------------------------------------------------------------