Skip to content

Commit c7829e1

Browse files
authored
Merge pull request kodecocodes#236 from dontfollowmeimcrazy/kmp-z-algorithms
Knuth-Morris-Pratt and Z-Algorithm
2 parents 0087e28 + 5423763 commit c7829e1

File tree

11 files changed

+738
-0
lines changed

11 files changed

+738
-0
lines changed
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
//: Playground - noun: a place where people can play
2+
3+
4+
/// Computes the Z-array of `ptnr`.
///
/// For every index `k >= 1`, element `k` of the result is the length of the
/// longest run of characters starting at `k` that is also a prefix of `ptnr`.
/// Element `0` is left at `0`. Positions are counted in `Character`s, so a
/// multi-scalar character such as an emoji occupies a single slot.
///
/// - Parameter ptnr: The string to pre-process.
/// - Returns: The Z-array, one entry per character, or `nil` when `ptnr` is empty.
func ZetaAlgorithm(ptnr: String) -> [Int]? {

    // `String` is itself a collection of `Character`s; the old `characters`
    // view is no longer available in current Swift.
    let pattern = Array(ptnr)
    let patternLength = pattern.count

    guard patternLength > 0 else {
        return nil
    }

    var zeta = [Int](repeating: 0, count: patternLength)

    // `left ... right` delimits the rightmost-ending stretch found so far that
    // matches a prefix of the pattern (the current "Z-box").
    var left = 0
    var right = 0

    for k in 1 ..< patternLength {
        if k > right {
            // Outside every known Z-box: compare against the prefix explicitly.
            var matched = 0
            while k + matched < patternLength && pattern[k + matched] == pattern[matched] {
                matched += 1
            }

            zeta[k] = matched

            if matched > 0 {
                left = k
                right = k + matched - 1
            }
        } else {
            // Inside the Z-box: position `k` mirrors position `k - left` of the prefix.
            let mirrored = zeta[k - left]
            // Number of characters from `k` to the end of the Z-box.
            let betaLength = right - k + 1

            if mirrored < betaLength {
                // The mirrored match ends strictly inside the box, so it carries over as is.
                zeta[k] = mirrored
            } else {
                // The match reaches the end of the box: keep comparing past `right`.
                var prefixIndex = betaLength
                var scanIndex = right + 1

                while scanIndex < patternLength && pattern[prefixIndex] == pattern[scanIndex] {
                    prefixIndex += 1
                    scanIndex += 1
                }

                zeta[k] = scanIndex - k
                left = k
                right = scanIndex - 1
            }
        }
    }
    return zeta
}
59+
60+
extension String {

    /// Finds every occurrence of `ptnr` in the receiver using the
    /// Knuth-Morris-Pratt algorithm.
    ///
    /// Offsets are counted in `Character`s from the start of the string, and
    /// overlapping occurrences are reported as well.
    ///
    /// - Parameter ptnr: The pattern to search for.
    /// - Returns: The starting offsets of all occurrences, in the order they are
    ///   found, or `nil` when `ptnr` is empty or does not occur in the string.
    func indexesOf(ptnr: String) -> [Int]? {

        let text = Array(self)
        let pattern = Array(ptnr)

        let textLength = text.count
        let patternLength = pattern.count

        /* Pre-processing stage: computing the table for the shifts (through Z-Algorithm) */
        guard patternLength > 0, let zeta = ZetaAlgorithm(ptnr: ptnr) else {
            return nil
        }

        var suffixPrefix = [Int](repeating: 0, count: patternLength)

        // A prefix match of length `zeta[j]` starting at `j` ends at
        // `j + zeta[j] - 1`. Walking `j` downwards lets the smallest start,
        // i.e. the longest match, be the last one written for a given end.
        for patternIndex in (1 ..< patternLength).reversed() {
            let matchEnd = patternIndex + zeta[patternIndex] - 1
            suffixPrefix[matchEnd] = zeta[patternIndex]
        }

        /* Search stage: scanning the text for pattern matching */
        var indexes = [Int]()
        var textIndex = 0
        var patternIndex = 0

        // Keep going while the current alignment of the pattern still fits in the text.
        while textIndex + (patternLength - patternIndex - 1) < textLength {

            while patternIndex < patternLength && text[textIndex] == pattern[patternIndex] {
                textIndex += 1
                patternIndex += 1
            }

            if patternIndex == patternLength {
                indexes.append(textIndex - patternIndex)
            }

            if patternIndex == 0 {
                // Nothing matched at this position: advance in the text.
                textIndex += 1
            } else {
                // Reuse the part of the match that is also a prefix of the pattern.
                patternIndex = suffixPrefix[patternIndex - 1]
            }
        }

        return indexes.isEmpty ? nil : indexes
    }
}
115+
116+
// MARK: - Examples

// A long ASCII text with several occurrences of the pattern.
let dna = "ACCCGGTTTTAAAGAACCACCATAAGATATAGACAGATATAGGACAGATATAGAGACAAAACCCCATACCCCAATATTTTTTTGGGGAGAAAAACACCACAGATAGATACACAGACTACACGAGATACGACATACAGCAGCATAACGACAACAGCAGATAGACGATCATAACAGCAATCAGACCGAGCGCAGCAGCTTTTAAGCACCAGCCCCACAAAAAACGACAATFATCATCATATACAGACGACGACACGACATATCACACGACAGCATA"
dna.indexesOf(ptnr: "CATA")   // [20, 64, 130, 140, 166, 234, 255, 270]

// Emoji text: offsets are counted in characters.
let concert = "🎼🎹🎹🎸🎸🎻🎻🎷🎺🎤👏👏👏"
concert.indexesOf(ptnr: "🎻🎷")   // [6]
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
2+
<playground version='5.0' target-platform='osx'>
3+
<timeline fileName='timeline.xctimeline'/>
4+
</playground>

Knuth-Morris-Pratt/KnuthMorrisPratt.playground/playground.xcworkspace/contents.xcworkspacedata

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/* Knuth-Morris-Pratt algorithm for pattern/string matching
2+
3+
The code is based on the book:
4+
"Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology"
5+
by Dan Gusfield
6+
Cambridge University Press, 1997
7+
*/
8+
9+
import Foundation
10+
11+
extension String {

    /// Finds every occurrence of `ptnr` in the receiver using the
    /// Knuth-Morris-Pratt algorithm.
    ///
    /// Offsets are counted in `Character`s from the start of the string, and
    /// overlapping occurrences are reported as well.
    ///
    /// - Parameter ptnr: The pattern to search for.
    /// - Returns: The starting offsets of all occurrences, in the order they are
    ///   found, or `nil` when `ptnr` is empty or does not occur in the string.
    func indexesOf(ptnr: String) -> [Int]? {

        let text = Array(self)
        let pattern = Array(ptnr)

        let textLength = text.count
        let patternLength = pattern.count

        /* Pre-processing stage: computing the table for the shifts (through Z-Algorithm) */
        guard patternLength > 0, let zeta = ZetaAlgorithm(ptnr: ptnr) else {
            return nil
        }

        var suffixPrefix = [Int](repeating: 0, count: patternLength)

        // A prefix match of length `zeta[j]` starting at `j` ends at
        // `j + zeta[j] - 1`. Walking `j` downwards lets the smallest start,
        // i.e. the longest match, be the last one written for a given end.
        for patternIndex in (1 ..< patternLength).reversed() {
            let matchEnd = patternIndex + zeta[patternIndex] - 1
            suffixPrefix[matchEnd] = zeta[patternIndex]
        }

        /* Search stage: scanning the text for pattern matching */
        var indexes = [Int]()
        var textIndex = 0
        var patternIndex = 0

        // Keep going while the current alignment of the pattern still fits in the text.
        while textIndex + (patternLength - patternIndex - 1) < textLength {

            while patternIndex < patternLength && text[textIndex] == pattern[patternIndex] {
                textIndex += 1
                patternIndex += 1
            }

            if patternIndex == patternLength {
                indexes.append(textIndex - patternIndex)
            }

            if patternIndex == 0 {
                // Nothing matched at this position: advance in the text.
                textIndex += 1
            } else {
                // Reuse the part of the match that is also a prefix of the pattern.
                patternIndex = suffixPrefix[patternIndex - 1]
            }
        }

        return indexes.isEmpty ? nil : indexes
    }
}

Knuth-Morris-Pratt/README.markdown

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
# Knuth-Morris-Pratt String Search
2+
3+
Goal: Write a linear-time string matching algorithm in Swift that returns the indexes of all the occurrences of a given pattern.
4+
5+
In other words, we want to implement an `indexesOf(ptnr: String)` extension on `String` that returns an array `[Int]` of integers, representing the indexes of all occurrences of the search pattern, or `nil` if the pattern could not be found inside the string.
6+
7+
For example:
8+
9+
```swift
10+
let dna = "ACCCGGTTTTAAAGAACCACCATAAGATATAGACAGATATAGGACAGATATAGAGACAAAACCCCATACCCCAATATTTTTTTGGGGAGAAAAACACCACAGATAGATACACAGACTACACGAGATACGACATACAGCAGCATAACGACAACAGCAGATAGACGATCATAACAGCAATCAGACCGAGCGCAGCAGCTTTTAAGCACCAGCCCCACAAAAAACGACAATFATCATCATATACAGACGACGACACGACATATCACACGACAGCATA"
11+
dna.indexesOf(ptnr: "CATA") // Output: [20, 64, 130, 140, 166, 234, 255, 270]
12+
13+
let concert = "🎼🎹🎹🎸🎸🎻🎻🎷🎺🎤👏👏👏"
14+
concert.indexesOf(ptnr: "🎻🎷") // Output: [6]
15+
```
16+
17+
The [Knuth-Morris-Pratt algorithm](https://en.wikipedia.org/wiki/Knuth–Morris–Pratt_algorithm) is considered one of the best algorithms for solving the pattern matching problem. Although in practice [Boyer-Moore](../Boyer-Moore/) is usually preferred, the algorithm that we will introduce is simpler, and has the same (linear) running time.
18+
19+
The idea behind the algorithm is not too different from the [naive string search](../Brute-Force String Search/) procedure. Like the naive approach, Knuth-Morris-Pratt aligns the text with the pattern and compares characters from left to right. But, instead of shifting by one character when a mismatch occurs, it uses a more intelligent way to move the pattern along the text. In fact, the algorithm features a pattern pre-processing stage where it acquires all the information that lets it skip redundant comparisons, resulting in larger shifts.
20+
21+
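The pre-processing stage produces an array (called `suffixPrefix` in the code) of integers in which every element `suffixPrefix[i]` records the length of the longest proper suffix of `P[0...i]` (where `P` is the pattern) that matches a prefix of `P`. In other words, `suffixPrefix[i]` is the length of the longest proper substring of `P` that ends at position `i` and that is a prefix of `P`. The code below actually computes a slightly stronger variant: a suffix only counts if the character that follows it in `P` differs from the character that follows the matching prefix, because otherwise the comparison made right after the shift would fail again. Just a quick example. Consider `P = "abadfryaabsabadffg"`, then `suffixPrefix[4] = 0`, `suffixPrefix[9] = 2`, `suffixPrefix[15] = 5`.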
22+
There are different ways to obtain the values of the `suffixPrefix` array. We will use the method based on the [Z-Algorithm](../Z-Algorithm/). This function takes the pattern as input and produces an array of integers. Each element `Z[i]` represents the length of the longest substring starting at position `i` of `P` that matches a prefix of `P`. You can notice that the two arrays are similar: they record the same information but in different places. We only have to find a method to map `Z[i]` to `suffixPrefix[j]`. It is not that difficult, and this is the code that will do it for us:
23+
24+
```swift
25+
for patternIndex in (1 ..< patternLength).reversed() {
26+
textIndex = patternIndex + zeta![patternIndex] - 1
27+
suffixPrefix[textIndex] = zeta![patternIndex]
28+
}
29+
```
30+
31+
We are simply computing the index of the end of the substring starting at position `i` (which, as we know, matches a prefix of `P`). The element of `suffixPrefix` at that index is then set to the length of that substring.
32+
33+
Once the shift-array `suffixPrefix` is ready we can begin with the pattern search stage. The algorithm first attempts to compare the characters of the text with those of the pattern. If it succeeds, it goes on until a mismatch occurs or the pattern is exhausted. When that happens, it checks whether an occurrence of the pattern is present (and reports it). Then, if no characters were matched, the text cursor is moved forward; otherwise the pattern is shifted to the right. The amount of the shift is based on the `suffixPrefix` array, and it guarantees that the prefix `P[0..<suffixPrefix[i]]` will match its opposing substring in the text. In this way, shifts of more than one character are often made and a lot of comparisons can be avoided, saving a lot of time.
34+
35+
Here is the code of the Knuth-Morris-Pratt algorithm:
36+
37+
```swift
38+
extension String {

    /// Finds every occurrence of `ptnr` in the receiver using the
    /// Knuth-Morris-Pratt algorithm.
    ///
    /// Offsets are counted in `Character`s from the start of the string, and
    /// overlapping occurrences are reported as well.
    ///
    /// - Parameter ptnr: The pattern to search for.
    /// - Returns: The starting offsets of all occurrences, in the order they are
    ///   found, or `nil` when `ptnr` is empty or does not occur in the string.
    func indexesOf(ptnr: String) -> [Int]? {

        let text = Array(self)
        let pattern = Array(ptnr)

        let textLength = text.count
        let patternLength = pattern.count

        /* Pre-processing stage: computing the table for the shifts (through Z-Algorithm) */
        guard patternLength > 0, let zeta = ZetaAlgorithm(ptnr: ptnr) else {
            return nil
        }

        var suffixPrefix = [Int](repeating: 0, count: patternLength)

        // A prefix match of length `zeta[j]` starting at `j` ends at
        // `j + zeta[j] - 1`. Walking `j` downwards lets the smallest start,
        // i.e. the longest match, be the last one written for a given end.
        for patternIndex in (1 ..< patternLength).reversed() {
            let matchEnd = patternIndex + zeta[patternIndex] - 1
            suffixPrefix[matchEnd] = zeta[patternIndex]
        }

        /* Search stage: scanning the text for pattern matching */
        var indexes = [Int]()
        var textIndex = 0
        var patternIndex = 0

        // Keep going while the current alignment of the pattern still fits in the text.
        while textIndex + (patternLength - patternIndex - 1) < textLength {

            while patternIndex < patternLength && text[textIndex] == pattern[patternIndex] {
                textIndex += 1
                patternIndex += 1
            }

            if patternIndex == patternLength {
                indexes.append(textIndex - patternIndex)
            }

            if patternIndex == 0 {
                // Nothing matched at this position: advance in the text.
                textIndex += 1
            } else {
                // Reuse the part of the match that is also a prefix of the pattern.
                patternIndex = suffixPrefix[patternIndex - 1]
            }
        }

        return indexes.isEmpty ? nil : indexes
    }
}
93+
```
94+
95+
Let's walk through an example using the code above. Let's consider the pattern `P = "ACTGACTA"`, the resulting `suffixPrefix` array equal to `[0, 0, 0, 0, 0, 0, 3, 1]`, and the text `T = "GCACTGACTGACTGACTAG"`. The algorithm begins with the text and the pattern aligned like below. We have to compare `T[0]` with `P[0]`.
96+
97+
1
98+
0123456789012345678
99+
text: GCACTGACTGACTGACTAG
100+
textIndex: ^
101+
pattern: ACTGACTA
102+
patternIndex: ^
103+
x
104+
suffixPrefix: 00000031
105+
106+
We have a mismatch. We have to check whether a pattern occurrence is present, but there is none. Since no character of the pattern was matched (`patternIndex` is still `0`), the text cursor is simply moved forward and we restart by comparing `T[1]` with `P[0]`. Again a mismatch occurs, so we go on with `T[2]` and `P[0]`.
107+
108+
1
109+
0123456789012345678
110+
text: GCACTGACTGACTGACTAG
111+
textIndex: ^
112+
pattern: ACTGACTA
113+
patternIndex: ^
114+
suffixPrefix: 00000031
115+
116+
This time we have a match, and it continues until position `8`. Unfortunately the length of the match is not equal to the pattern length, so we cannot report an occurrence. But we are still lucky because we can now use the values computed in the `suffixPrefix` array. In fact, the length of the match is `7`, and if we look at the element `suffixPrefix[7 - 1]` we discover that it is `3`. This information tells us that the first `3` characters of `P` match the last `3` characters of the substring `T[0...8]`. So the `suffixPrefix` array guarantees us that the two substrings match and that we do not have to compare their characters, so we can shift the pattern right by more than one character!
117+
The comparisons restart from `T[9]` and `P[3]`.
118+
119+
1
120+
0123456789012345678
121+
text: GCACTGACTGACTGACTAG
122+
textIndex: ^
123+
pattern: ACTGACTA
124+
patternIndex: ^
125+
suffixPrefix: 00000031
126+
127+
They match, so we continue the comparisons until position `13`, where a mismatch occurs between the characters `G` and `A`. Just like before, we are lucky and we can use the `suffixPrefix` array to shift the pattern right.
128+
129+
1
130+
0123456789012345678
131+
text: GCACTGACTGACTGACTAG
132+
textIndex: ^
133+
pattern: ACTGACTA
134+
patternIndex: ^
135+
suffixPrefix: 00000031
136+
137+
Again, we have to compare. But this time the comparisons finally take us to an occurrence, at position `17 - 7 = 10`.
138+
139+
1
140+
0123456789012345678
141+
text: GCACTGACTGACTGACTAG
142+
textIndex: ^
143+
pattern: ACTGACTA
144+
patternIndex: ^
145+
suffixPrefix: 00000031
146+
147+
The algorithm then tries to compare `T[18]` with `P[1]` (because we used the element `suffixPrefix[8 - 1] = 1`) but it fails and at the next iteration it ends its work.
148+
149+
150+
The pre-processing stage involves only the pattern. The running time of the Z-Algorithm is linear and takes `O(n)`, where `n` is the length of the pattern `P`. After that, the search stage does not "overshoot" the length of the text `T` (call it `m`). It can be proved that the number of comparisons of the search stage is bounded by `2 * m`. The final running time of the Knuth-Morris-Pratt algorithm is `O(n + m)`.
151+
152+
153+
> **Note:** To execute the code in the [KnuthMorrisPratt.swift](./KnuthMorrisPratt.swift) you have to copy the [ZAlgorithm.swift](../Z-Algorithm/ZAlgorithm.swift) file contained in the [Z-Algorithm](../Z-Algorithm/) folder. The [KnuthMorrisPratt.playground](./KnuthMorrisPratt.playground) already includes the definition of the `ZetaAlgorithm` function.
154+
155+
Credits: This code is based on the handbook ["Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology"](https://books.google.it/books/about/Algorithms_on_Strings_Trees_and_Sequence.html?id=Ofw5w1yuD8kC&redir_esc=y) by Dan Gusfield, Cambridge University Press, 1997.
156+
157+
*Written for Swift Algorithm Club by Matteo Dunnhofer*

0 commit comments

Comments
 (0)