-
Notifications
You must be signed in to change notification settings - Fork 74
Expand file tree
/
Copy pathHomoglyph.java
More file actions
168 lines (142 loc) · 5.96 KB
/
Homoglyph.java
File metadata and controls
168 lines (142 loc) · 5.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
package net.codebox.homoglyph;
import java.util.*;
/**
* Use this class to detect occurrences of target words inside a String, where the target words may have been
* disguised using homoglyph substitution and/or by mixing upper/lower case letters (for example, the class
* will find the word "credit" in the String "Free Ꮯгⴹԁ1t").
*
* You can supply your own list of homoglyphs, or use the char_codes.txt which should accompany this source file.
* You can find the latest version of char_codes.txt at https://github.com/codebox/homoglyph
*
* @author Rob Dawson
*/
public class Homoglyph {
    /** Homoglyph groups supplied at construction; each set holds codepoints that are homoglyphs of one another. */
    private final List<Set<Integer>> homoglyphs = new ArrayList<>();

    /**
     * Per-codepoint cache of homoglyph sets. It is backed by a plain {@link HashMap} and no synchronization
     * is performed, so a single instance should not be searched from several threads at once.
     */
    private final CachingLookup cache = new CachingLookup();

    /**
     * Supply a List of Sets, with each Set containing a group of Unicode codepoints that are homoglyphs. Codepoints
     * must be represented using Integer rather than Character values because some are too large to be held by the
     * 16-bit Character type.
     *
     * The list itself is copied, the Sets inside it are not.
     *
     * @param homoglyphs a List of Sets, with each Set containing a group of Unicode codepoints that are homoglyphs
     */
    public Homoglyph(final List<Set<Integer>> homoglyphs) {
        this.homoglyphs.addAll(homoglyphs);
    }

    /**
     * Search the String {@code text} to locate all occurrences of the words contained in {@code targetWords},
     * accounting for homoglyph substitution and variations of case.
     *
     * Results are grouped by target word, in the iteration order of {@code targetWords}; within one word they
     * are ordered by ascending position. Empty target words never match.
     *
     * @param text text to be searched
     * @param targetWords words to be located
     * @return a List containing the results of the search, if no matches were found an empty list will be returned
     */
    public List<SearchResult> search(final String text, final Collection<String> targetWords) {
        final List<SearchResult> allResults = new ArrayList<>();
        // Decode the text once; it is reused for every target word.
        final CodePoints textCodepoints = new CodePoints(text);

        for (final String targetWord : targetWords) {
            allResults.addAll(checkForWord(textCodepoints, new CodePoints(targetWord)));
        }

        return allResults;
    }

    /**
     * Search the String {@code text} to locate all occurrences of the words contained in {@code targetWords},
     * accounting for homoglyph substitution and variations of case.
     *
     * @param text text to be searched
     * @param targetWords words to be located
     * @return a List containing the results of the search, if no matches were found an empty list will be returned
     */
    public List<SearchResult> search(final String text, final String... targetWords) {
        return search(text, Arrays.asList(targetWords));
    }

    /**
     * Collects every position in {@code text} at which {@code targetWord} occurs.
     *
     * @param text the decoded text being searched
     * @param targetWord the decoded word to look for
     * @return one result per matching start position (possibly overlapping), in ascending order
     */
    private Collection<SearchResult> checkForWord(final CodePoints text, final CodePoints targetWord) {
        final Collection<SearchResult> results = new ArrayList<>();
        final int wordLength = targetWord.getLength();

        // An empty word would trivially "match" at every index (including one past the end),
        // flooding the result list with zero-length matches.
        if (wordLength == 0) {
            return results;
        }

        // Last start index at which the whole word still fits; negative when the word is longer than the text.
        final int lastIndex = text.getLength() - wordLength;
        for (int i = 0; i <= lastIndex; i++) {
            if (hasWordAtIndex(text, targetWord, i)) {
                results.add(new SearchResult(i, text.subStringAt(i, wordLength), targetWord.getText()));
            }
        }

        return results;
    }

    /**
     * Tests whether {@code targetWord} occurs in {@code text} starting at codepoint position {@code index}.
     * Each text codepoint must be a homoglyph of either the lower-case or the upper-case form of the
     * corresponding target codepoint.
     */
    private boolean hasWordAtIndex(final CodePoints text, final CodePoints targetWord, final int index) {
        for (int i = 0; i < targetWord.getLength(); i++) {
            final int targetChar = targetWord.getValue(i);
            final int textChar = text.getValue(index + i);

            final boolean matchesLower = checkForHomoglyphs(Character.toLowerCase(targetChar), textChar);
            if (!matchesLower && !checkForHomoglyphs(Character.toUpperCase(targetChar), textChar)) {
                return false;
            }
        }
        return true;
    }

    /**
     * @return true if {@code cp2} is {@code cp1} itself or appears in a homoglyph group together with {@code cp1}
     */
    private boolean checkForHomoglyphs(final int cp1, final int cp2) {
        return cache.lookup(cp1).contains(cp2);
    }

    /**
     * Describes a single match found by {@link Homoglyph#search(String, Collection)}.
     */
    public static class SearchResult {
        public SearchResult(final int index, final String match, final String word) {
            this.index = index;
            this.match = match;
            this.word = word;
        }

        /** Position of the match within the searched text, counted in Unicode codepoints (not chars). */
        public int index;
        /** The portion of the searched text that matched. */
        public String match;
        /** The target word that was matched. */
        public String word;

        @Override
        public String toString() {
            return String.format("'%s' at position %s matches '%s'", match, index, word);
        }
    }

    /**
     * A String decoded into its Unicode codepoints so that it can be indexed per codepoint rather than
     * per UTF-16 char.
     */
    public static class CodePoints {
        private final int[] codepoints;
        private final String text;

        /**
         * @param text the String to decode, must not be null
         */
        public CodePoints(String text) {
            this.text = text;
            this.codepoints = new int[text.codePointCount(0, text.length())];

            int count = 0;
            for (int offset = 0; offset < text.length(); ) {
                final int codepoint = text.codePointAt(offset);
                codepoints[count++] = codepoint;
                // Supplementary codepoints occupy two chars, so advance by the actual width.
                offset += Character.charCount(codepoint);
            }
        }

        /**
         * @param i zero-based codepoint position
         * @return the codepoint at position {@code i}
         */
        public int getValue(int i) {
            return codepoints[i];
        }

        /**
         * @return the number of codepoints in the text
         */
        public int getLength() {
            return codepoints.length;
        }

        /**
         * @return the original String
         */
        public String getText() {
            return text;
        }

        /**
         * @param s zero-based codepoint position at which the substring starts
         * @param l number of codepoints to include
         * @return the String formed by the {@code l} codepoints starting at position {@code s}
         */
        public String subStringAt(final int s, final int l) {
            final StringBuilder sb = new StringBuilder(l);
            for (int i = 0; i < l; i++) {
                sb.appendCodePoint(this.codepoints[s + i]);
            }
            return sb.toString();
        }
    }

    /**
     * Resolves a codepoint to the set of codepoints it can be confused with, remembering each answer so the
     * homoglyph groups are scanned at most once per distinct codepoint.
     */
    public class CachingLookup {
        private final Map<Integer, Set<Integer>> lookup = new HashMap<>();

        /**
         * Returns the homoglyphs of {@code cp}. The returned set always contains {@code cp} itself, plus the
         * members of every group that contains {@code cp}; null groups are ignored.
         *
         * @param cp the codepoint to look up
         * @return the cached set of homoglyphs for {@code cp}, never null
         */
        public Set<Integer> lookup(final int cp) {
            Set<Integer> s = lookup.get(cp);
            if (s == null) {
                // Build a private set rather than caching a caller-supplied one.
                s = new HashSet<>();
                s.add(cp);
                // Merge all groups containing cp: stopping at the first one would miss homoglyphs
                // that are only listed in a later, overlapping group.
                for (final Set<Integer> thisSet : homoglyphs) {
                    if (thisSet != null && thisSet.contains(cp)) {
                        s.addAll(thisSet);
                    }
                }
                lookup.put(cp, s);
            }
            return s;
        }
    }
}