Recently I spent a fair amount of time researching issues with Unicode. More often than not, copy-pasting from a Word document would result in corrupted text after the form was processed. What I found is that Word replaces some input with Unicode characters: em-dashes, ellipses, trademark and copyright signs, and some quotation marks, for example.
I wrote a method to convert Unicode punctuation characters to Basic Latin (a.k.a. ASCII). It’s not a complete or definitive tool by any means, so you are welcome to extend it with your own cases.
I used the Unicode charts with hex values found on the official Unicode site. At the end of every sheet you can find substitution examples for the most common characters. The idea was to replace these offending characters with their ASCII equivalents. For example, an ellipsis would become "...", an em-dash would become "--", etc.
/// <summary>
/// Converts common Unicode punctuation in the supplied string to its closest
/// equivalent in Basic Latin (printable ASCII, U+0020..U+007E).
/// See the Unicode charts at http://unicode.org/charts/ for more info.
/// </summary>
/// <remarks>
/// The input is processed in a single pass. Symbols that need more than one
/// ASCII character (ellipsis, em-dash, vulgar fractions, copyright and
/// trademark signs, ...) are expanded, look-alike punctuation is mapped to a
/// single ASCII character, and every remaining character outside
/// U+0020..U+007E is removed. That includes control characters such as tab,
/// carriage return and line feed, so line breaks are not preserved.
/// </remarks>
/// <param name="input">String to convert. May be null or empty.</param>
/// <returns>
/// The converted string, containing only printable ASCII characters.
/// When <paramref name="input"/> is null or empty it is returned unchanged.
/// </returns>
public string ConvertToBasicLatin(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return input;
    }

    // Fully qualified so this method does not rely on a "using System.Text;" directive.
    System.Text.StringBuilder result = new System.Text.StringBuilder(input.Length);

    for (int i = 0; i < input.Length; i++)
    {
        char current = input[i];

        // Code points above U+FFFF are stored as two chars (a surrogate pair),
        // so they cannot be matched by a single-char lookup.
        if (char.IsHighSurrogate(current)
            && i + 1 < input.Length
            && char.IsLowSurrogate(input[i + 1]))
        {
            // U+10191 (Roman uncia sign) is the only supplementary character
            // with a mapping; every other one is dropped as non-ASCII.
            if (char.ConvertToUtf32(current, input[i + 1]) == 0x10191)
            {
                result.Append('-');
            }

            i++; // The low surrogate has been consumed as well.
            continue;
        }

        string replacement = GetBasicLatinReplacement(current);
        if (replacement != null)
        {
            result.Append(replacement);
        }
        else if (current >= '\u0020' && current <= '\u007E')
        {
            // Already printable ASCII: keep as is.
            result.Append(current);
        }
        // Anything else (unmapped non-ASCII, control characters, lone
        // surrogates) is intentionally dropped.
    }

    return result.ToString();
}

/// <summary>
/// Returns the ASCII replacement for a single UTF-16 code unit, or null when
/// the character has no explicit mapping.
/// </summary>
/// <param name="c">Character to look up.</param>
/// <returns>The ASCII replacement text, or null if <paramref name="c"/> is not mapped.</returns>
private static string GetBasicLatinReplacement(char c)
{
    switch (c)
    {
        // Ellipsis and two-dot leader
        case '\u2026': return "...";
        case '\u2025': return "..";

        // Em-dash
        case '\u2014': return "--";

        // Vulgar fractions
        case '\u00BC': return "1/4";
        case '\u00BD': return "1/2";
        case '\u00BE': return "3/4";
        case '\u2153': return "1/3";
        case '\u2154': return "2/3";
        case '\u2155': return "1/5";
        case '\u2156': return "2/5";
        case '\u2157': return "3/5";
        case '\u2158': return "4/5";
        case '\u2159': return "1/6";
        case '\u215A': return "5/6";
        case '\u215B': return "1/8";
        case '\u215C': return "3/8";
        case '\u215D': return "5/8";
        case '\u215E': return "7/8";
        case '\u215F': return "1/";

        // Combined question / exclamation marks
        case '\u2048': return "?!";
        case '\u2049': return "!?";

        // Copyright
        case '\u00A9': return "(C)";

        // Registered sign, service mark, trademark
        case '\u00AE': return "(R)";
        case '\u2120': return "(SM)";
        case '\u2122': return "(TM)";

        // Spaces
        case '\u00A0':
        case '\u200B':
        case '\u2060':
        case '\u3000':
        case '\uFEFF':
            return " ";

        // Exclamation mark (the interrobang U+203D is reduced to '!')
        case '\u00A1':
        case '\u01C3':
        case '\u203C':
        case '\u203D':
        case '\u2762':
            return "!";

        // Double quotation mark
        case '\u02BA':
        case '\u02DD':
        case '\u02EE':
        case '\u02F5':
        case '\u02F6':
        case '\u030B':
        case '\u030E':
        case '\u2033':
        case '\u2036':
        case '\u3003':
        case '\u00AB':
        case '\u00BB':
        case '\u201C':
        case '\u201D':
        case '\u201E':
        case '\u201F':
            return "\"";

        // Number sign
        case '\u2114':
        case '\u266F':
            return "#";

        // Percent sign
        case '\u066A':
        case '\u2030':
        case '\u2031':
        case '\u2052':
            return "%";

        // Single quotation mark / apostrophe
        // (U+201A and U+201B are single quotes, so they belong here and not
        // with the double quotes.)
        case '\u2018':
        case '\u2019':
        case '\u201A':
        case '\u201B':
        case '\u02B9':
        case '\u02BB':
        case '\u02BC':
        case '\u02BD':
        case '\u02CA':
        case '\u02CB':
        case '\u02C8':
        case '\u0301':
        case '\u2032':
        case '\uA78C':
        case '\u0060':
        case '\u0300':
        case '\u2035':
        case '\u00B4':
            return "'";

        // Underscore
        case '\u02CD':
        case '\u0331':
        case '\u0332':
        case '\u2017':
            return "_";

        // Hyphen (U+10191 is handled by the caller because it is a surrogate pair)
        case '\u2010':
        case '\u2011':
        case '\u2012':
        case '\u2013':
        case '\u2212':
            return "-";

        // Less than
        case '\u2039':
        case '\u2329':
        case '\u27E8':
        case '\u3008':
            return "<";

        // Greater than
        case '\u203A':
        case '\u232A':
        case '\u27E9':
        case '\u3009':
            return ">";

        // Question mark (U+061F is the Arabic question mark; U+061E is kept
        // for compatibility with the previous mapping)
        case '\u00BF':
        case '\u037E':
        case '\u061E':
        case '\u061F':
            return "?";

        // Circumflex accent
        case '\u02C4':
        case '\u02C6':
        case '\u0302':
        case '\u2038':
        case '\u2303':
            return "^";

        // Vertical bar
        case '\u01C0':
        case '\u05C0':
        case '\u2223':
        case '\u2758':
            return "|";

        // Tilde
        case '\u02DC':
        case '\u0303':
        case '\u2053':
        case '\u223C':
        case '\uFF5E':
            return "~";

        // Asterisk
        case '\u066D':
        case '\u204E':
        case '\u2217':
        case '\u26B9':
        case '\u2731':
            return "*";

        // Bullets and middle dots
        case '\u00B7':
        case '\u0387':
        case '\u2022':
        case '\u2024':
        case '\u2027':
        case '\u2219':
        case '\u22C5':
        case '\u30FB':
            return "*";

        default:
            return null;
    }
}