/********************************************************** File: porter.java Description: Functions for applying the Porter transformation to a word. Author: Dana Vrajitoru and Stuart J. Barr Organization: IUSB Updated: February 2021 ***********************************************************/ package hashTable; public class Porter { static final int KEYWORDSIZE = 25; static final boolean PREFIXES = true; static final int BIG_KEYWORDSIZE = KEYWORDSIZE + 20; static String [] prefixes = { "kilo", "micro", "milli", "intra", "ultra", "mega", "nano", "pico", "pseudo"}; // extract word stem static public String stripAffixes (String text) { text = text.toLowerCase(); text = clean(text); text = stripPrefixes(text); text = stripSuffixes(text); return text; //text[KEYWORDSIZE] = '\0'; } // stripAffixes() // remove everything that is not a letter or a digit static public String clean (String kwd) { int i, last = kwd.length(); String result = ""; for ( i = 0 ; i < last ; i++ ) { if ( isvalid(kwd.charAt(i)) ) result = result + kwd.charAt(i); } return result; } // clean // returns 0 if the character is a letter or a digit, 1 otherwise static public boolean isvalid(char l) { if ( (l >= 'a') && (l <= 'z') ) return true; if ( (l >= 'A') && (l <= 'Z') ) return true; if ( (l >= '0') && (l <= '9') ) return true; return false; } // isvalid() // remove composing prefixes such as "kilo". static public String stripPrefixes (String text ) { for ( int i = 0 ; i < prefixes.length ; i++ ) { text = text.replace(prefixes[i], ""); } return text; } // stripPrefixes() // remove composing pieces at the end of the text, such as "ly" static public String stripSuffixes ( String text ) { text = step1 ( text ); text = step2 ( text ); text = step3 ( text ); text = step4 ( text ); return step5 ( text ); } // stripSuffixes() static public String step1 ( String text ) { char [] stem = new char[BIG_KEYWORDSIZE]; if ( last(text) == 's' ) { if ( hasSuffix(text, "sses", stem) || hasSuffix(text, "ies", stem) ) text = removeEnd(text, 2); else if ( text.charAt(text.length() - 2) != 's' ) text = removeEnd(text, 1); } if ( hasSuffix(text,"eed",stem) == true ) { if ( measure(stem) > 0 ) text = removeEnd(text, 1); } else { if ( ( hasSuffix(text, "ed", stem) || hasSuffix(text, "ing", stem)) && containsVowel(String.valueOf(stem))) { text = text.substring(0, realSize(stem)); if ( hasSuffix(text, "at", stem) || hasSuffix(text, "bl", stem) || hasSuffix(text, "iz", stem)) { text = text + "e"; } else { int length = text.length(); if ( text.charAt(length-1) == text.charAt(length-2) && last(text) != 'l' && last(text) != 's' && last(text) != 'z' ) text = removeEnd(text, 1); else if ( measure(text) == 1 ) { if ( cvc(text) ) { text = text + "e"; } } } } } if ( hasSuffix(text, "y", stem) && containsVowel(String.valueOf(stem)) ) text = removeEnd(text, 1) + 'i'; return text; } // step_1() static public String step2 ( String text ) { String [][] suffixes = { { "ational", "ate" }, { "tional", "tion" }, { "enci", "ence" }, { "anci", "ance" }, { "izer", "ize" }, { "iser", "ize" }, { "abli", "able" }, { "alli", "al" }, { "entli", "ent" }, { "eli", "e" }, { "ousli", "ous" }, { "ization", "ize" }, { "isation", "ize" }, { "ation", "ate" }, { "ator", "ate" }, { "alism", "al" }, { "iveness", "ive" }, { "fulness", "ful" }, { "ousness", "ous" }, { "aliti", "al" }, { "iviti", "ive" }, { "biliti", "ble" } }; char [] stem = new char[BIG_KEYWORDSIZE]; int index; for ( index = 0 ; index < suffixes.length ; index++ ) { if ( hasSuffix ( text, suffixes[index][0], stem ) ) { if ( measure ( String.valueOf(stem )) > 0 ) { text = toString(stem ) + suffixes[index][1]; return text; } } } return text; } // step2() static public String step3 ( String text ) { String [][] suffixes = { { "icate", "ic" }, { "ative", "" }, { "alize", "al" }, { "alise", "al" }, { "iciti", "ic" }, { "ical", "ic" }, { "ful", "" }, { "ness", "" }}; char [] stem = new char[BIG_KEYWORDSIZE]; int index; for ( index = 0 ; index < suffixes.length ; index++ ) { if ( hasSuffix ( text, suffixes[index][0], stem ) ) if ( measure ( String.valueOf(stem )) > 0 ) { text = toString(stem) + suffixes[index][1]; return text; } } return text; } // step3 static public String step4 ( String text ) { String [] suffixes = { "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent", "sion", "tion", "ou", "ism", "ate", "iti", "ous", "ive", "ize", "ise"}; char [] stem = new char[KEYWORDSIZE]; int index; for ( index = 0 ; index < suffixes.length; index++ ) { if ( hasSuffix ( text, suffixes[index], stem ) ) if ( measure ( String.valueOf(stem )) > 1 ) { text = toString(stem); return text; } } return text; } // step4() static public String step5 ( String text ) { if ( last(text) == 'e' ) { if ( measure(text) > 1 ) /* measure(text)==measure(stem) if ends in vowel */ text = removeEnd(text, 1); else if ( measure(text) == 1 ) { String stem = removeEnd(text, 1); if ( !cvc(stem) ) text = removeEnd(text, 1); } } if ( (last(text) == 'l') && (text.charAt(text.length() - 2) == 'l') && (measure(text) > 1) ) text = removeEnd(text, 1); return text; } // step5() // returns the last character in a string static public char last(String text) { return text.charAt(text.length() - 1); } // returns the size of the character array without the spaces static public int realSize(char [] text) { int i = 0; while (i < text.length && text[i] != '\0' && (int)(text[i]) > 32) i++; return i; } static public String toString(char [] text) { String result = ""; int size = realSize(text); for (int i = 0; i < size; i++) result += text[i]; return result; } // returns the text without chars characters at the end static public String removeEnd(String text, int chars) { return text.substring(0, text.length() - chars); } // checks if the word has a given suffix static public Boolean hasSuffix ( String word, String suffix, char [] stem) { if (!word.contains(suffix) || word.indexOf(suffix) != word.length() - suffix.length()) return false; if (suffix.length() > 1 && last(word) != last(suffix)) return false; copyArray(word.substring(0, word.length() - suffix.length()), stem); stem[word.length() - suffix.length()] = '\0'; return true; } // hasSuffix() static public void copyArray(String source, char [] dest) { int i = 0; for (char ch: source.toCharArray()) { dest[i] = ch; i++; } dest[i] = '\0'; } static public Boolean cvc ( String text ) { int length = text.length(); if ( length < 3 ) return false; if ( !vowel(text.charAt(length-1), text.charAt(length-2)) && (text.charAt(length-1) != 'w') && (text.charAt(length-1) != 'x') && (text.charAt(length-1) != 'y') && (vowel(text.charAt(length-2), text.charAt(length-3))) && ( ( length == 3 && !vowel(text.charAt(0),'a') ) || (!vowel(text.charAt(length-3),text.charAt(length-4))))) return true; else return false; } // cvc() // returns true if the character is a vowel. It needs the previous // character because y is considered a vowel only when preceded by a // vowel. static public Boolean vowel ( char ch, char prev ) { switch ( ch ) { case 'a': case 'e': case 'i': case 'o': case 'u': return true; case 'y': return vowel(prev,'?') ; default : return false; } } // vowel() static public int measure(char [] stem) { return measure(String.valueOf(stem)); } static public int measure ( String stem ) { int i = 0, count = 0; int length = stem.length(); while ( i < length ) { for ( ; i < length ; i++ ) { if ( i > 0 ) { if ( vowel(stem.charAt(i), stem.charAt(i-1)) ) break; } else { if ( vowel(stem.charAt(i),'a') ) break; } } for ( i++ ; i < length ; i++ ) { if ( i > 0 ) { if ( !vowel(stem.charAt(i), stem.charAt(i-1)) ) break; } else { if ( !vowel(stem.charAt(i),'?') ) break; } } if ( i < length ) { count++; i++; } } return count; } // measure() // checks if the word contains a vowel static public Boolean containsVowel ( String word ) { int i; for ( i = 0 ; i < word.length() ; i++ ) if ( i > 0 ) { if ( vowel(word.charAt(i), word.charAt(i-1)) ) return true; } else { if ( vowel(word.charAt(0), 'a') ) return true; } return false; } // containsVowel() }