Trie
208. Implement Trie (Prefix Tree)
class TrieNode {
public:
// Initialize your data structure here.
bool isLeaf {false};
TrieNode* child[26] = {nullptr};
// shared_ptr<TrieNode> child[26];
TrieNode() {
}
};
class Trie {
public:
Trie() {
// root = make_shared<TrieNode>();
root = new TrieNode();
}
// Inserts a word into the trie.
void insert(string word) {
auto node = root;
for (auto c : word) {
if (!node->child[c-'a']) {
// node->child[c-'a'] = make_shared<TrieNode>();
node->child[c-'a'] = new TrieNode();
}
node = node->child[c-'a'];
}
node->isLeaf = true;
}
// Returns if the word is in the trie.
bool search(string word) {
auto node = root;
for (auto c : word) {
if (!node->child[c-'a']) return false;
node = node->child[c-'a'];
}
return node->isLeaf;
}
// Returns if there is any word in the trie
// that starts with the given prefix.
bool startsWith(string prefix) {
auto node = root;
for (auto c : prefix) {
if (!node->child[c-'a']) return false;
node = node->child[c-'a'];
}
return true;
}
private:
// shared_ptr<TrieNode> root;
TrieNode* root;
};
// Your Trie object will be instantiated and called as such:
// Trie trie;
// trie.insert("somestring");
// trie.search("key");
212. Word Search II
Given a 2D board and a list of words from the dictionary, find all words in the board.
Each word must be constructed from letters of sequentially adjacent cell, where "adjacent" cells are those horizontally or vertically neighboring. The same letter cell may not be used more than once in a word.
For example, Given words = ["oath","pea","eat","rain"] and board = [ ['o','a','a','n'], ['e','t','a','e'], ['i','h','k','r'], ['i','f','l','v'] ] Return ["eat","oath"].
Note: You may assume that all inputs are consist of lowercase letters a-z.
class TrieNode {
public:
// Initialize your data structure here.
bool isLeaf {false};
TrieNode* child[26] = {nullptr};
TrieNode() {
}
};
class Trie {
public:
Trie() {
root = new TrieNode();
}
// Inserts a word into the trie.
void insert(string word) {
auto node = root;
for (auto c : word) {
if (!node->child[c-'a']) {
node->child[c-'a'] = new TrieNode();
}
node = node->child[c-'a'];
}
node->isLeaf = true;
}
// Returns if the word is in the trie.
bool search(string word) {
auto node = root;
for (auto c : word) {
if (!node->child[c-'a']) return false;
node = node->child[c-'a'];
}
return node->isLeaf;
}
// Returns if there is any word in the trie
// that starts with the given prefix.
bool startsWith(string prefix) {
auto node = root;
for (auto c : prefix) {
if (!node->child[c-'a']) return false;
node = node->child[c-'a'];
}
return true;
}
private:
TrieNode* root;
};
class Solution {
public:
vector<string> findWords(vector<vector<char>>& board, vector<string>& words) {
Trie trie;
for (auto word : words) trie.insert(word);
set<string> result;
int m = board.size();
if (!m) return {};
int n= board[0].size();
if (!n) return {};
vector<vector<bool>> visited(m, vector<bool>(n, false));
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
dfs(board, i, j, trie, "", result, visited);
}
}
vector<string> res(result.begin(), result.end());
return res;
}
void dfs(vector<vector<char>>& board, int x, int y, Trie& trie, string word, set<string>& res, vector<vector<bool>>& visited) {
if (visited[x][y]) return;
word += board[x][y];
if (trie.search(word)) {
res.insert(word);
}
if (!trie.startsWith(word)) return;
visited[x][y] = true;
vector<int> steps{0, 1, 0, -1, 0};
for (size_t i = 0; i < steps.size() - 1; ++i) {
int new_x = x + steps[i];
int new_y = y + steps[i+1];
if (new_x >= 0 && new_x < board.size() && new_y >= 0 && new_y < board[0].size()) {
dfs(board, new_x, new_y, trie, word , res, visited);
}
}
visited[x][y] = false;
}
};
Trie Serialization
Serialize and deserialize a trie (prefix tree, search on internet for more details).
You can specify your own serialization algorithm, the online judge only cares about whether you can successfully deserialize the output from your own serialize function.
Example
str = serialize(old_trie)
>> str can be anything to represent a trie
new_trie = deserialize(str)
>> new_trie should have the same structure and values with old_trie
An example of test data: trie tree <a<b<e<>>c<>d<f<>>>>, denote the following structure:
root
/
a
/ | \
b c d
/ \
e f
/**
* Definition of TrieNode:
* class TrieNode {
* public:
* TrieNode() {}
* map<char, TrieNode*> children;
* };
*/
class Solution {
public:
/**
* This method will be invoked first, you should design your own algorithm
* to serialize a trie which denote by a root node to a string which
* can be easily deserialized by your own "deserialize" method later.
*/
string serialize(TrieNode* root) {
// Write your code here
if (!root) return "";
string res;
for (auto kv : root->children) {
res += kv.first;
res += serialize(kv.second);
}
return "<"+res+">";
}
/**
* This method will be invoked second, the argument data is what exactly
* you serialized at method "serialize", that means the data is not given by
* system, it's given by your own serialize method. So the format of data is
* designed by yourself, and deserialize it here as you serialize it in
* "serialize" method.
*/
TrieNode* deserialize(string data) {
// Write your code here
TrieNode* root = new TrieNode();
stack<TrieNode*> st;
TrieNode* cur = root;
for (int i = 0; i < data.size(); ++i) {
if (data[i] == '<') {
st.push(cur);
} else if (data[i] == '>') {
st.pop();
} else {
cur = new TrieNode();
st.top()->children[data[i]] = cur;
}
}
return root;
}
};
Get Similar Words
给定一个word list 和一个target word,要求输出在word list 里跟target word的edit distance 相差不大于k的所有words。
直接用edit distance挨个遍历一遍也能做,但是如果word list很大,那重复计算会非常多,这时候可以用trie来优化。 下面举个例,假设字典为["cs", "ct", "cby"],target word为"cat",k=1。 首先建Trie:
c
/ | \
b s t
/
y
从根节点开始搜索,遍历的单词分别为:c -> cb -> (cby) -> cs -> ct。 与普通的Edit distance动态规划算法一样,我们对每一个“路过”的单词记录一个DP table。那么所遍历的单词的DP table应该为(假设当前遍历单词,也就是下面代码中的path为”c”):
c a t
[ 0 1 2 3 ] <- prev_dist
-> c [ 1 0 1 2 ] <- cur_dist
cb [ 2 1 1 2 ]
cs [ 2 1 1 2 ]
ct [ 2 1 1 1 ]
每一行的最后一位数字即为当前单词与target word的edit distance。显然,如果该值小于等于k,则加入到结果中。最终返回的结果只有一个单词,即ct。 注意,遍历到单词cb时,edit distance已经为2,且长度已经与cat相等,也就意味着该节点的子树中所包含的单词与target word的edit distance无论如何都不可能小于等于k,因此直接从该子树返回。所以单词cby是没有被遍历到的。这也就是Trie做这题的便利之处,当字典较大时会明显提高效率。
代码如下:
class Trie {
public:
struct Node {
Node(char val): value{val}{}
~Node() {
for(int i = 0; i < 27; ++i) {
if (children[i]) {
delete children[i];
children[i] = nullptr;
}
}
}
Node* get(char c, bool auto_create = true) {
assert((c >= 'a' && c <= 'z') || c == '\0');
int id = c == '\0' ? 26 : c - 'a';
if (!children[id] && auto_create) {
children[id] = new Node(c);
children[id]->parent = this;
num_children++;
}
return children[id];
}
bool end() {
return get('\0',false);
}
int num_children { 0 };
char value{ 0 };
Node* parent{ nullptr };
Node* children[27] { nullptr };
};
public:
Trie() {
_root = new Node(-1);
}
~Trie() {
if (_root) {
delete _root;
_root = nullptr;
}
}
void add_string(const string& str) {
auto node = _root;
for (auto c : str) {
node = node->get(c);
}
node->get('\0');
}
vector<string> get_similar(const string& s, int threshold) {
vector<string> ret;
function<void(Node*, string&, vector<int>&)> search = [&](Node* node,
string& path, vector<int>& prev_dist) {
if (node->end() && prev_dist.back() <= threshold)
ret.push_back(path);
for(int i = 0; i < 26; ++i) {
if (!node->children[i]) continue;
vector<int> cur_dist = prev_dist;
cur_dist[0]++;
char letter = 'a' + i;
bool go = cur_dist[0] <= threshold;
for (int len = 1; len <= s.size(); ++len) {
int ins_cost = cur_dist[len - 1] + 1;
int del_cost = prev_dist[len] + 1;
int rep_cost = 0;
if (s[len - 1] != letter)
rep_cost = prev_dist[len - 1] + 1;
else
rep_cost = prev_dist[len - 1];
cur_dist[len] = min(min(ins_cost, del_cost), rep_cost);
if (cur_dist[len] <= threshold) go = true;
}
if (go) {
path.push_back(letter);
search(node->children[i], path, cur_dist);
path.pop_back();
}
}
};
if(s.empty()) return ret;
string p;
vector<int> dist(s.size() + 1);
iota(dist.begin(), dist.end(), 0);
search(_root, p, dist);
return ret;
}
private:
Node* _root{ nullptr };
};
Search Pattern
给一个dictionary, 再给一个set of coding string (g5, goo3, goog2, go2le………). return all string from dictionary that can be matched with the coding string. 要求尽量减少dictionary look up 次数。
因为给了一个字典,可能包含单词非常多,可以用trie来预处理。trie的代码请见Get Similar Words中的实现。下面只给出搜索函数
vector<string> search_num_pattern(const string& str) {
vector<string> ret;
function<void(int,int,string,Node*)> search = [&](int id, int num, string cur, Node* node){
if(!node) return;
if (id > str.length()) {
if (node->value == '\0') {
cur.resize(cur.length() - 1);
ret.push_back(cur);
}
return;
}
if (num > 0) {
for(int i = 0; i < 26; ++i) {
if (node->children[i])
search(id, num-1, cur + node->children[i]->value, node->children[i]);
}
} else {
char c = id < str.length() ? str[id] : '\0';
if (c >= '0' && c <= '9') {
int over_id = id + 1;
for(; over_id < str.length() && str[over_id] >= '0' && str[over_id] <= '9'; ++over_id);
int cur_num = atoi(str.substr(id, over_id - id).c_str()) - 1;
for(int i = 0; i < 26; ++i) {
if (node->children[i])
search(over_id, cur_num, cur + node->children[i]->value, node->children[i]);
}
} else {
search(id+1, 0, cur + c, node->get(str[id], false));
}
}
};
search(0, 0, "", _root);
return ret;
}