演算法#15--子字串查詢演算法彙總和程式碼詳解
阿新 • • 發佈:2019-01-07
1.演算法彙總
首先,來看一張彙總表,本文會將表裡的每種演算法作詳細介紹。程式碼和邏輯比較長,可以根據目錄跳著看。
2.暴力演算法
在文字中可能出現匹配的任何地方都檢查是否存在。原理很簡單,直接看程式碼就可以懂。
//暴力子字串查詢
/**
 * Brute-force substring search: tries every possible alignment of the
 * pattern against the text and compares character by character.
 */
public class ViolenceSubStringSearch
{
    /**
     * Returns the index of the first occurrence of the pattern in the text.
     *
     * @param pat the pattern string
     * @param txt the text string
     * @return the index of the first occurrence of {@code pat} in {@code txt};
     *         {@code txt.length()} if there is no such occurrence
     */
    public static int search(String pat, String txt)
    {
        int M = pat.length();
        int N = txt.length();
        for (int i = 0; i <= N - M; i++)
        {
            // Advance j while the characters agree; stop at the first mismatch.
            int j = 0;
            while (j < M && txt.charAt(i + j) == pat.charAt(j))
            {
                j++;
            }
            if (j == M)
            {
                return i; // every pattern character matched at offset i
            }
        }
        return N; // no match
    }
}
執行軌跡:
3.KMP演算法
KMP演算法的基本思想是當出現不匹配時,就能知曉一部分文字的內容(因為在匹配失敗之前它們已經和模式匹配)。我們可以利用這些資訊避免將指標回退到所有這些已知的字元之前。
KMP的主要思想是提前判斷如何重新開始查詢,而這種判斷只取決於模式本身。
在KMP子字串查詢演算法中,不會回退文字指標i,而是使用一個數組dfa[][]來記錄匹配失敗時模式指標j應該回退多遠。dfa[][]稱為確定有限狀態自動機(DFA)。
如何構造dfa?
在狀態j處匹配失敗時,DFA的處理方式和回退時的處理方式相同,除非在pat.charAt(j)處匹配成功,這時DFA應該前進到狀態j+1。例如,對於ABABAC,要判斷在j=5時匹配失敗後DFA應該怎麼做。通過DFA可以知道完全回退之後演算法會掃描BABA並到達狀態3,因此可以將dfa[][3]複製到dfa[][5]並將C所對應的元素的值設為6。因為在計算DFA的第j個狀態時只需要知道DFA是如何處理前j-1個字元的,所以總能從尚不完整的DFA中得到所需的資訊。
最後一個關鍵的細節,如何維護重啟位置X,因為X< j,所以可以由已經構造的DFA部分來完成這個任務–X的下一個值是dfa[pat.charAt(j)][X].
總結下,對於每個j,DFA會:
- 將dfa[][X]複製到dfa[][j](對於失敗的情況)
- 將dfa[pat.charAt(j)][j]設為j+1(對於匹配成功的情況)
- 更新X。
如下圖:
//KMP子字串查詢
/**
 * Knuth-Morris-Pratt substring search.
 *
 * The pattern is compiled once into a deterministic finite automaton
 * ({@code dfa}); searching then reads each text character exactly once and
 * never moves the text pointer backwards.
 */
public class KMP
{
    private final int R;       // the radix (alphabet size)
    private final int m;       // the pattern length
    private final int[][] dfa; // dfa[c][j] = next state after reading c in state j

    /**
     * Preprocesses the pattern string using a radix of 256.
     *
     * @param pat the pattern string
     */
    public KMP(String pat)
    {
        // Delegate so that both search overloads work regardless of which
        // constructor built the instance.
        this(pat.toCharArray(), 256);
    }

    /**
     * Preprocesses the pattern string.
     *
     * @param pattern the pattern string
     * @param R the alphabet size; every pattern character must be smaller than R
     */
    public KMP(char[] pattern, int R)
    {
        this.R = R;
        this.m = pattern.length;
        this.dfa = new int[R][m];
        if (m == 0)
        {
            return; // empty pattern: no states to build, it matches at offset 0
        }
        // build DFA from pattern
        dfa[pattern[0]][0] = 1;
        for (int x = 0, j = 1; j < m; j++)
        {
            for (int c = 0; c < R; c++)
            {
                dfa[c][j] = dfa[c][x];   // Copy mismatch cases.
            }
            dfa[pattern[j]][j] = j + 1;  // Set match case.
            x = dfa[pattern[j]][x];      // Update restart state.
        }
    }

    /**
     * Returns the DFA state reached from state j on character c.
     * A character outside the alphabet cannot occur in the pattern, so no
     * pattern prefix can end with it and the automaton restarts at state 0.
     */
    private int next(char c, int j)
    {
        return (c < R) ? dfa[c][j] : 0;
    }

    /**
     * Returns the index of the first occurrence of the pattern string
     * in the text string.
     *
     * @param txt the text string
     * @return the index of the first occurrence of the pattern string
     *         in the text string; the text length if no such match
     */
    public int search(String txt)
    {
        // simulate operation of DFA on text
        int n = txt.length();
        int i, j;
        for (i = 0, j = 0; i < n && j < m; i++)
        {
            j = next(txt.charAt(i), j);
        }
        if (j == m) return i - m; // found
        return n;                 // not found
    }

    /**
     * Returns the index of the first occurrence of the pattern string
     * in the text string.
     *
     * @param text the text string
     * @return the index of the first occurrence of the pattern string
     *         in the text string; the text length if no such match
     */
    public int search(char[] text)
    {
        // simulate operation of DFA on text
        int n = text.length;
        int i, j;
        for (i = 0, j = 0; i < n && j < m; i++)
        {
            j = next(text[i], j);
        }
        if (j == m) return i - m; // found
        return n;                 // not found
    }

    /**
     * Searches a fixed sample text for a fixed sample pattern with both
     * search overloads and prints the pattern aligned under its first
     * occurrence in the text.
     *
     * @param args the command-line arguments (not used)
     */
    public static void main(String[] args)
    {
        String pat = "AACAA";
        String txt = "AABRAACADABRAACAADABRA";
        char[] pattern = pat.toCharArray();
        char[] text = txt.toCharArray();
        KMP kmp1 = new KMP(pat);
        int offset1 = kmp1.search(txt);
        KMP kmp2 = new KMP(pattern, 256);
        int offset2 = kmp2.search(text);
        // print results
        System.out.println("text: " + txt);
        System.out.print("pattern: ");
        for (int i = 0; i < offset1; i++)
            System.out.print(" ");
        System.out.println(pat);
        System.out.print("pattern: ");
        for (int i = 0; i < offset2; i++)
            System.out.print(" ");
        System.out.println(pat);
    }
}
輸出:
text: AABRAACADABRAACAADABRA
pattern: AACAA
pattern: AACAA
4.BoyerMoore演算法
從右往左掃描,跳躍式匹配。用right[]來記錄跳躍表,它等於字元出現在模式中的位置,沒出現賦值為-1.
對於匹配失敗,有如下三種情況:
- 造成匹配失敗的字元不包含在模式字串中,將模式字串向右移動j+1個位置(即將i增加j+1)。
- 造成匹配失敗的字元包含在模式字串中,就可以用right[]陣列來將模式字串和文字對齊,使得該字元和它在模式字串中出現的最右位置相匹配。
- 如果這種方式無法增大i,那就直接將i+1來保證模式字串至少向右移動了一個位置。
//BoyerMoore字串匹配演算法(啟發式地處理不匹配的字元)
/**
 * Boyer-Moore substring search using the bad-character (mismatched
 * character) heuristic: the pattern is compared right to left and, on a
 * mismatch, shifted so that the offending text character lines up with its
 * rightmost occurrence in the pattern.
 */
public class BoyerMoore
{
    private final int R;          // the radix (alphabet size)
    private final int[] right;    // the bad-character skip array
    private final char[] pattern; // private copy of the pattern

    /**
     * Preprocesses the pattern string using a radix of 256.
     *
     * @param pat the pattern string
     */
    public BoyerMoore(String pat)
    {
        // Delegate so that both search overloads work regardless of which
        // constructor built the instance.
        this(pat.toCharArray(), 256);
    }

    /**
     * Preprocesses the pattern string.
     *
     * @param pattern the pattern string
     * @param R the alphabet size; every pattern character must be smaller than R
     */
    public BoyerMoore(char[] pattern, int R)
    {
        this.R = R;
        this.pattern = pattern.clone(); // defensive copy
        // position of rightmost occurrence of c in the pattern
        right = new int[R];
        for (int c = 0; c < R; c++)
        {
            right[c] = -1; // characters absent from the pattern map to -1
        }
        for (int j = 0; j < this.pattern.length; j++)
        {
            // later occurrences overwrite earlier ones, leaving the rightmost index
            right[this.pattern[j]] = j;
        }
    }

    /**
     * Returns the rightmost index of c in the pattern, or -1 when c does not
     * occur. A character outside the alphabet cannot be in the pattern.
     */
    private int rightmost(char c)
    {
        return (c < R) ? right[c] : -1;
    }

    /**
     * Returns the index of the first occurrence of the pattern string
     * in the text string.
     *
     * @param txt the text string
     * @return the index of the first occurrence of the pattern string
     *         in the text string; n if no such match
     */
    public int search(String txt)
    {
        int m = pattern.length;
        int n = txt.length();
        int skip;
        for (int i = 0; i <= n - m; i += skip)
        {
            skip = 0;
            for (int j = m - 1; j >= 0; j--)
            {
                char c = txt.charAt(i + j);
                if (pattern[j] != c)
                {
                    // shift by at least one so the search always advances
                    skip = Math.max(1, j - rightmost(c));
                    break;
                }
            }
            if (skip == 0) return i; // found
        }
        return n; // not found
    }

    /**
     * Returns the index of the first occurrence of the pattern string
     * in the text string.
     *
     * @param text the text string
     * @return the index of the first occurrence of the pattern string
     *         in the text string; n if no such match
     */
    public int search(char[] text)
    {
        int m = pattern.length;
        int n = text.length;
        int skip;
        for (int i = 0; i <= n - m; i += skip)
        {
            skip = 0;
            for (int j = m - 1; j >= 0; j--)
            {
                char c = text[i + j];
                if (pattern[j] != c)
                {
                    // shift by at least one so the search always advances
                    skip = Math.max(1, j - rightmost(c));
                    break;
                }
            }
            if (skip == 0) return i; // found
        }
        return n; // not found
    }

    /**
     * Searches a fixed sample text for a fixed sample pattern with both
     * search overloads and prints the pattern aligned under its first
     * occurrence in the text.
     *
     * @param args the command-line arguments (not used)
     */
    public static void main(String[] args)
    {
        String pat = "AACAA";
        String txt = "AABRAACADABRAACAADABRA";
        char[] pattern = pat.toCharArray();
        char[] text = txt.toCharArray();
        BoyerMoore boyermoore1 = new BoyerMoore(pat);
        BoyerMoore boyermoore2 = new BoyerMoore(pattern, 256);
        int offset1 = boyermoore1.search(txt);
        int offset2 = boyermoore2.search(text);
        // print results
        System.out.println("text: " + txt);
        System.out.print("pattern: ");
        for (int i = 0; i < offset1; i++)
            System.out.print(" ");
        System.out.println(pat);
        System.out.print("pattern: ");
        for (int i = 0; i < offset2; i++)
            System.out.print(" ");
        System.out.println(pat);
    }
}
輸出:
text: AABRAACADABRAACAADABRA
pattern: AACAA
pattern: AACAA
5.RabinKarp演算法
計算模式字串的雜湊函式,然後用相同的雜湊函式計算文字中所有可能的M個字元的子字串雜湊值並尋找匹配。
import java.math.BigInteger;
import java.util.Random;
//RabinKarp指紋字串查詢演算法
/**
 * Rabin-Karp fingerprint substring search.
 *
 * The pattern is hashed once; a rolling hash of every m-character window of
 * the text is then compared against it, and each hash hit is confirmed by a
 * character-by-character comparison (Las Vegas variant).
 */
public class RabinKarp
{
    private String pat;    // the pattern, kept for the exact-match confirmation
    private long patHash;  // hash value of the pattern
    private int m;         // pattern length
    private long q;        // a random 31-bit prime used as the hash modulus
    private int R;         // radix
    private long RM;       // R^(m-1) % q, weight of a window's leading character

    /**
     * Not implemented: always throws.
     *
     * @param pattern the pattern string
     * @param R the alphabet size
     * @throws UnsupportedOperationException always
     */
    public RabinKarp(char[] pattern, int R)
    {
        throw new UnsupportedOperationException("Operation not supported yet");
    }

    /**
     * Preprocesses the pattern string.
     *
     * @param pat the pattern string
     */
    public RabinKarp(String pat)
    {
        this.pat = pat;
        this.R = 256;
        this.m = pat.length();
        this.q = longRandomPrime();

        // R^(m-1) % q, needed later to subtract the leading character of a window.
        long power = 1;
        int remaining = m - 1;
        while (remaining > 0)
        {
            power = (R * power) % q;
            remaining--;
        }
        this.RM = power;
        this.patHash = hash(pat, m);
    }

    // Horner's-rule hash of key[0..len-1], reduced modulo q at every step.
    private long hash(String key, int len)
    {
        long acc = 0;
        int pos = 0;
        while (pos < len)
        {
            acc = (R * acc + key.charAt(pos)) % q;
            pos++;
        }
        return acc;
    }

    // Las Vegas confirmation: does the pattern equal txt[start..start+m-1]?
    private boolean matchesAt(String txt, int start)
    {
        int k = 0;
        while (k < m && pat.charAt(k) == txt.charAt(start + k))
        {
            k++;
        }
        return k >= m;
    }

    // Monte Carlo confirmation: trusts the hash and always reports a match.
    @SuppressWarnings("unused")
    private boolean matchesAt(int start)
    {
        return true;
    }

    /**
     * Returns the index of the first occurrence of the pattern string
     * in the text string.
     *
     * @param txt the text string
     * @return the index of the first occurrence of the pattern string
     *         in the text string; n if no such match
     */
    public int search(String txt)
    {
        int n = txt.length();
        if (n < m)
        {
            return n;
        }

        // Window at offset 0.
        long window = hash(txt, m);
        if (window == patHash && matchesAt(txt, 0))
        {
            return 0;
        }

        // Slide the window one character at a time.
        for (int incoming = m; incoming < n; incoming++)
        {
            int outgoing = incoming - m;
            // Drop the leading character; adding q keeps the value non-negative.
            long leading = RM * txt.charAt(outgoing) % q;
            window = (window + q - leading) % q;
            // Shift by one radix position and append the trailing character.
            window = (window * R + txt.charAt(incoming)) % q;

            int offset = outgoing + 1;
            if (window == patHash && matchesAt(txt, offset))
            {
                return offset;
            }
        }
        return n; // no match
    }

    // a random 31-bit prime
    private static long longRandomPrime()
    {
        return BigInteger.probablePrime(31, new Random()).longValue();
    }

    /**
     * Searches a fixed sample text for a fixed sample pattern and prints the
     * pattern aligned under its first occurrence in the text.
     *
     * @param args the command-line arguments (not used)
     */
    public static void main(String[] args)
    {
        String pat = "AACAA";
        String txt = "AABRAACADABRAACAADABRA";
        RabinKarp searcher = new RabinKarp(pat);
        int offset = searcher.search(txt);
        // print results
        System.out.println("text: " + txt);
        System.out.print("pattern: ");
        int printed = 0;
        while (printed < offset)
        {
            System.out.print(" ");
            printed++;
        }
        System.out.println(pat);
    }
}