{-|
== Suffix array and LCP array

A suffix array is the sorted array of all suffixes of a string, each suffix represented by its
start index. An LCP (longest common prefix) array is an array of the lengths of the longest common
prefixes of consecutive suffixes in the suffix array.
A suffix array, sometimes with its LCP array, can be used to perform tasks like finding patterns
in the string or calculating certain properties of the string.
Other suffix structures, such as suffix trees and suffix automata may serve as alternates to suffix
arrays.

This implementation constructs the suffix array and LCP array from a given string indexing function,
with elements of type Int. It takes O(n log n), which is usually fast enough. O(n) algorithms to
construct suffix arrays exist.

Sources:

* Udi Manber and Gene Myers, "Suffix Arrays: A New Method for On-Line String Searches", 1990
  https://dl.acm.org/doi/10.5555/320176.320218
* Kasai et al., "Linear-Time Longest-Common-Prefix Computation in Suffix Arrays and Its
  Applications", 2001
  https://link.springer.com/chapter/10.1007/3-540-48194-X_17
* https://sites.google.com/site/indy256/algo/suffix_array

-}

{-
Implementation notes:
* The construction is implemented using a prefix doubling algorithm, similar to the algorithm due to
  Manber and Myers. Each step sorts substrings of lengths in successive powers of two.
* The last character is considered smaller than other equal characters. This is one way, and an easy
  one, to allow shorter substrings to appear first in the suffix array.
* At the step sorting substrings of length 2 * m, we go over the substrings in the sorted order of
  their right halves and put them into positons corresponding to the ranks of their left halves.
* The LCP array is constructed from the suffix array using Kasai's algorithm.
-}

module SuffixArray
    ( Chr
    , SuffixId
    , buildSufA
    , buildSufAL
    ) where

import Control.Monad
import Control.Monad.ST
import Data.Array.ST
import Data.Array.Unboxed
import Data.Function
import Data.List
import Data.Ord

import Sort ( sortUABy, countingSortUA )

type Chr = Int
type SuffixId = Int

-- | Builds a suffix array and LCP array. n is the length of the string. at is a 0-based indexing
-- function into the string. Characters must be in [0..b-1]. Faster than buildSufAL unless b is too
-- large. O(b + n log n).
buildSufA :: Chr -> Int -> (Int -> Chr) -> (UArray Int SuffixId, UArray Int Int)
buildSufA :: Chr -> Chr -> (Chr -> Chr) -> (UArray Chr Chr, UArray Chr Chr)
buildSufA Chr
b = ((Chr -> Chr) -> UArray Chr Chr -> UArray Chr Chr)
-> Chr -> (Chr -> Chr) -> (UArray Chr Chr, UArray Chr Chr)
buildSufA_ (Chr -> (Chr -> Chr) -> UArray Chr Chr -> UArray Chr Chr
forall e.
(IArray UArray e, forall s. MArray (STUArray s) e (ST s)) =>
Chr -> (e -> Chr) -> UArray Chr e -> UArray Chr e
countingSortUA (Chr
b Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
+ Chr
1))

-- | Builds a suffix array and LCP array. n is the length of the string. at is a 0-based indexing
-- function into the string. Intended for large alphabets. O(n log n).
buildSufAL :: Int -> (Int -> Chr) -> (UArray Int SuffixId, UArray Int Int)
buildSufAL :: Chr -> (Chr -> Chr) -> (UArray Chr Chr, UArray Chr Chr)
buildSufAL = ((Chr -> Chr) -> UArray Chr Chr -> UArray Chr Chr)
-> Chr -> (Chr -> Chr) -> (UArray Chr Chr, UArray Chr Chr)
buildSufA_ ((Chr -> Chr -> Ordering) -> UArray Chr Chr -> UArray Chr Chr
forall e.
(forall s. MArray (STUArray s) e (ST s), IArray UArray e) =>
(e -> e -> Ordering) -> UArray Chr e -> UArray Chr e
sortUABy ((Chr -> Chr -> Ordering) -> UArray Chr Chr -> UArray Chr Chr)
-> ((Chr -> Chr) -> Chr -> Chr -> Ordering)
-> (Chr -> Chr)
-> UArray Chr Chr
-> UArray Chr Chr
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Chr -> Chr) -> Chr -> Chr -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing)

buildSufA_ :: ((Int -> Chr) -> UArray Int SuffixId -> UArray Int SuffixId)
           -> Int -> (Int -> Chr) -> (UArray Int SuffixId, UArray Int Int)
buildSufA_ :: ((Chr -> Chr) -> UArray Chr Chr -> UArray Chr Chr)
-> Chr -> (Chr -> Chr) -> (UArray Chr Chr, UArray Chr Chr)
buildSufA_ (Chr -> Chr) -> UArray Chr Chr -> UArray Chr Chr
sortf Chr
n Chr -> Chr
at = (UArray Chr Chr
p, UArray Chr Chr
lcp) where
    (UArray Chr Chr
p, UArray Chr Chr
r) = Chr
-> UArray Chr Chr
-> UArray Chr Chr
-> (UArray Chr Chr, UArray Chr Chr)
sufADoubling Chr
n UArray Chr Chr
p0 UArray Chr Chr
r0
    lcp :: UArray Chr Chr
lcp = (Chr -> Chr) -> UArray Chr Chr -> UArray Chr Chr -> UArray Chr Chr
kasai Chr -> Chr
at UArray Chr Chr
p UArray Chr Chr
r
    p0 :: UArray Chr Chr
p0 = (Chr -> Chr) -> UArray Chr Chr -> UArray Chr Chr
sortf Chr -> Chr
at' ((Chr, Chr) -> [Chr] -> UArray Chr Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [e] -> a i e
listArray (Chr
0, Chr
nChr -> Chr -> Chr
forall a. Num a => a -> a -> a
-Chr
1) [Chr
0..])
    r0 :: UArray Chr Chr
r0 = UArray Chr Chr -> (Chr -> Chr -> Bool) -> UArray Chr Chr
rankSufA UArray Chr Chr
p0 (Chr -> Chr -> Bool
forall a. Eq a => a -> a -> Bool
(==) (Chr -> Chr -> Bool) -> (Chr -> Chr) -> Chr -> Chr -> Bool
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` Chr -> Chr
at')
    lastc :: Chr
lastc = Chr -> Chr
at (Chr
n Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
- Chr
1)
    at' :: Chr -> Chr
at' Chr
i | Chr
i Chr -> Chr -> Bool
forall a. Eq a => a -> a -> Bool
== Chr
n Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
- Chr
1 = Chr
lastc
    at' Chr
i = let c :: Chr
c = Chr -> Chr
at Chr
i in Chr
c Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
+ Bool -> Chr
forall a. Enum a => a -> Chr
fromEnum (Chr
c Chr -> Chr -> Bool
forall a. Ord a => a -> a -> Bool
>= Chr
lastc)

sufADoubling :: Int -> UArray Int SuffixId -> UArray SuffixId Int -> (UArray Int SuffixId, UArray Int Int)
sufADoubling :: Chr
-> UArray Chr Chr
-> UArray Chr Chr
-> (UArray Chr Chr, UArray Chr Chr)
sufADoubling Chr
n UArray Chr Chr
p0 UArray Chr Chr
r0 = ((UArray Chr Chr, UArray Chr Chr)
 -> Chr -> (UArray Chr Chr, UArray Chr Chr))
-> (UArray Chr Chr, UArray Chr Chr)
-> [Chr]
-> (UArray Chr Chr, UArray Chr Chr)
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' ((UArray Chr Chr
 -> UArray Chr Chr -> Chr -> (UArray Chr Chr, UArray Chr Chr))
-> (UArray Chr Chr, UArray Chr Chr)
-> Chr
-> (UArray Chr Chr, UArray Chr Chr)
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry UArray Chr Chr
-> UArray Chr Chr -> Chr -> (UArray Chr Chr, UArray Chr Chr)
step) (UArray Chr Chr
p0, UArray Chr Chr
r0) ([Chr] -> (UArray Chr Chr, UArray Chr Chr))
-> [Chr] -> (UArray Chr Chr, UArray Chr Chr)
forall a b. (a -> b) -> a -> b
$ (Chr -> Bool) -> [Chr] -> [Chr]
forall a. (a -> Bool) -> [a] -> [a]
takeWhile (Chr -> Chr -> Bool
forall a. Ord a => a -> a -> Bool
<Chr
n) ([Chr] -> [Chr]) -> [Chr] -> [Chr]
forall a b. (a -> b) -> a -> b
$ (Chr -> Chr) -> Chr -> [Chr]
forall a. (a -> a) -> a -> [a]
iterate (Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
*Chr
2) Chr
1 where
    step :: UArray Int SuffixId -> UArray SuffixId Int -> Int -> (UArray Int SuffixId, UArray SuffixId Int)
    step :: UArray Chr Chr
-> UArray Chr Chr -> Chr -> (UArray Chr Chr, UArray Chr Chr)
step UArray Chr Chr
p UArray Chr Chr
r Chr
m = UArray Chr Chr
r UArray Chr Chr
-> (UArray Chr Chr, UArray Chr Chr)
-> (UArray Chr Chr, UArray Chr Chr)
`seq` (UArray Chr Chr
p', UArray Chr Chr
r') where
        p' :: UArray Chr Chr
p' = (forall s. ST s (STUArray s Chr Chr)) -> UArray Chr Chr
forall i e. (forall s. ST s (STUArray s i e)) -> UArray i e
runSTUArray ((forall s. ST s (STUArray s Chr Chr)) -> UArray Chr Chr)
-> (forall s. ST s (STUArray s Chr Chr)) -> UArray Chr Chr
forall a b. (a -> b) -> a -> b
$ do
            STUArray s Chr Chr
pos <- (Chr, Chr) -> [Chr] -> ST s (STUArray s Chr Chr)
forall (a :: * -> * -> *) e (m :: * -> *) i.
(MArray a e m, Ix i) =>
(i, i) -> [e] -> m (a i e)
newListArray (Chr
0, Chr
nChr -> Chr -> Chr
forall a. Num a => a -> a -> a
-Chr
1) [Chr
0..] :: ST s (STUArray s Int Int)
            STUArray s Chr Chr
pa <- UArray Chr Chr -> ST s (STUArray s Chr Chr)
forall i (a :: * -> * -> *) e (b :: * -> * -> *) (m :: * -> *).
(Ix i, IArray a e, MArray b e m) =>
a i e -> m (b i e)
thaw UArray Chr Chr
p
            [Chr] -> (Chr -> ST s ()) -> ST s ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ [Chr
i Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
- Chr
m | Chr
i <- UArray Chr Chr -> [Chr]
forall (a :: * -> * -> *) e i. (IArray a e, Ix i) => a i e -> [e]
elems UArray Chr Chr
p, Chr
i Chr -> Chr -> Bool
forall a. Ord a => a -> a -> Bool
>= Chr
m] ((Chr -> ST s ()) -> ST s ()) -> (Chr -> ST s ()) -> ST s ()
forall a b. (a -> b) -> a -> b
$ \Chr
i -> do
                Chr
x <- STUArray s Chr Chr -> Chr -> ST s Chr
forall (a :: * -> * -> *) e (m :: * -> *) i.
(MArray a e m, Ix i) =>
a i e -> i -> m e
readArray STUArray s Chr Chr
pos (UArray Chr Chr
rUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!Chr
i)
                STUArray s Chr Chr -> Chr -> Chr -> ST s ()
forall (a :: * -> * -> *) e (m :: * -> *) i.
(MArray a e m, Ix i) =>
a i e -> i -> e -> m ()
writeArray STUArray s Chr Chr
pos (UArray Chr Chr
rUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!Chr
i) (Chr
x Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
+ Chr
1)
                STUArray s Chr Chr -> Chr -> Chr -> ST s ()
forall (a :: * -> * -> *) e (m :: * -> *) i.
(MArray a e m, Ix i) =>
a i e -> i -> e -> m ()
writeArray STUArray s Chr Chr
pa Chr
x Chr
i
            STUArray s Chr Chr -> ST s (STUArray s Chr Chr)
forall (f :: * -> *) a. Applicative f => a -> f a
pure STUArray s Chr Chr
pa
        r' :: UArray Chr Chr
r' = UArray Chr Chr -> (Chr -> Chr -> Bool) -> UArray Chr Chr
rankSufA UArray Chr Chr
p' ((Chr -> Chr -> Bool) -> UArray Chr Chr)
-> (Chr -> Chr -> Bool) -> UArray Chr Chr
forall a b. (a -> b) -> a -> b
$ \Chr
i Chr
j -> UArray Chr Chr
rUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!Chr
i Chr -> Chr -> Bool
forall a. Eq a => a -> a -> Bool
== UArray Chr Chr
rUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!Chr
j Bool -> Bool -> Bool
&& Chr
i Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
+ Chr
m Chr -> Chr -> Bool
forall a. Ord a => a -> a -> Bool
< Chr
n Bool -> Bool -> Bool
&& UArray Chr Chr
rUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!(Chr
i Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
+ Chr
m) Chr -> Chr -> Bool
forall a. Eq a => a -> a -> Bool
== UArray Chr Chr
rUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!(Chr
j Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
+ Chr
m)

rankSufA :: UArray Int SuffixId -> (SuffixId -> SuffixId -> Bool) -> UArray SuffixId Int
rankSufA :: UArray Chr Chr -> (Chr -> Chr -> Bool) -> UArray Chr Chr
rankSufA UArray Chr Chr
p Chr -> Chr -> Bool
eq = (Chr, Chr) -> [(Chr, Chr)] -> UArray Chr Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [(i, e)] -> a i e
array (Chr
0, Chr
n') ([(Chr, Chr)] -> UArray Chr Chr) -> [(Chr, Chr)] -> UArray Chr Chr
forall a b. (a -> b) -> a -> b
$ (UArray Chr Chr
pUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!Chr
0, Chr
0) (Chr, Chr) -> [(Chr, Chr)] -> [(Chr, Chr)]
forall a. a -> [a] -> [a]
: (Chr -> (Chr -> [(Chr, Chr)]) -> Chr -> [(Chr, Chr)])
-> (Chr -> [(Chr, Chr)]) -> [Chr] -> Chr -> [(Chr, Chr)]
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Chr -> (Chr -> [(Chr, Chr)]) -> Chr -> [(Chr, Chr)]
f ([(Chr, Chr)] -> Chr -> [(Chr, Chr)]
forall a b. a -> b -> a
const []) [Chr
1..Chr
n'] Chr
0 where
    (Chr
0, Chr
n') = UArray Chr Chr -> (Chr, Chr)
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> (i, i)
bounds UArray Chr Chr
p
    f :: Chr -> (Chr -> [(Chr, Chr)]) -> Chr -> [(Chr, Chr)]
f Chr
i Chr -> [(Chr, Chr)]
k Chr
prv = let cur :: Chr
cur = if Chr -> Chr -> Bool
eq (UArray Chr Chr
pUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!(Chr
i Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
- Chr
1)) (UArray Chr Chr
pUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!Chr
i) then Chr
prv else Chr
i
                in Chr
cur Chr -> [(Chr, Chr)] -> [(Chr, Chr)]
`seq` (UArray Chr Chr
pUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!Chr
i, Chr
cur) (Chr, Chr) -> [(Chr, Chr)] -> [(Chr, Chr)]
forall a. a -> [a] -> [a]
: Chr -> [(Chr, Chr)]
k Chr
cur
{-# INLINE rankSufA #-}

kasai :: (Int -> Chr) -> UArray Int SuffixId -> UArray SuffixId Int -> UArray Int Int
kasai :: (Chr -> Chr) -> UArray Chr Chr -> UArray Chr Chr -> UArray Chr Chr
kasai Chr -> Chr
at UArray Chr Chr
p UArray Chr Chr
r = (Chr, Chr) -> [(Chr, Chr)] -> UArray Chr Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
(i, i) -> [(i, e)] -> a i e
array (Chr
0, Chr
n'Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
-Chr
1) ([(Chr, Chr)] -> UArray Chr Chr) -> [(Chr, Chr)] -> UArray Chr Chr
forall a b. (a -> b) -> a -> b
$ (Chr -> (Chr -> [(Chr, Chr)]) -> Chr -> [(Chr, Chr)])
-> (Chr -> [(Chr, Chr)]) -> [Chr] -> Chr -> [(Chr, Chr)]
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Chr -> (Chr -> [(Chr, Chr)]) -> Chr -> [(Chr, Chr)]
f ([(Chr, Chr)] -> Chr -> [(Chr, Chr)]
forall a b. a -> b -> a
const []) [Chr
0..Chr
n'] Chr
0 where
    (Chr
0, Chr
n') = UArray Chr Chr -> (Chr, Chr)
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> (i, i)
bounds UArray Chr Chr
p
    f :: Chr -> (Chr -> [(Chr, Chr)]) -> Chr -> [(Chr, Chr)]
f Chr
i Chr -> [(Chr, Chr)]
k Chr
_ | UArray Chr Chr
rUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!Chr
i Chr -> Chr -> Bool
forall a. Eq a => a -> a -> Bool
== Chr
n' = Chr -> [(Chr, Chr)]
k Chr
0
    f Chr
i Chr -> [(Chr, Chr)]
k Chr
x = (UArray Chr Chr
rUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!Chr
i, Chr
x') (Chr, Chr) -> [(Chr, Chr)] -> [(Chr, Chr)]
forall a. a -> [a] -> [a]
: Chr -> [(Chr, Chr)]
k (Chr -> Chr -> Chr
forall a. Ord a => a -> a -> a
max Chr
0 (Chr
x' Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
- Chr
1)) where
        j :: Chr
j = UArray Chr Chr
pUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!(UArray Chr Chr
rUArray Chr Chr -> Chr -> Chr
forall (a :: * -> * -> *) e i.
(IArray a e, Ix i) =>
a i e -> i -> e
!Chr
i Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
+ Chr
1)
        x' :: Chr
x' = (Chr -> Bool) -> (Chr -> Chr) -> Chr -> Chr
forall a. (a -> Bool) -> (a -> a) -> a -> a
until (\Chr
x'' -> Chr
i Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
+ Chr
x'' Chr -> Chr -> Bool
forall a. Ord a => a -> a -> Bool
> Chr
n' Bool -> Bool -> Bool
|| Chr
j Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
+ Chr
x'' Chr -> Chr -> Bool
forall a. Ord a => a -> a -> Bool
> Chr
n' Bool -> Bool -> Bool
|| Chr -> Chr
at (Chr
i Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
+ Chr
x'') Chr -> Chr -> Bool
forall a. Eq a => a -> a -> Bool
/= Chr -> Chr
at (Chr
j Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
+ Chr
x'')) (Chr -> Chr -> Chr
forall a. Num a => a -> a -> a
+Chr
1) Chr
x