retriving escape unicode sequences from files ...

Discussion in 'Java' started by qwertmonkey@syberianoutpost.ru, Aug 4, 2012.

  1. Guest

    Arne,
    ~
    I would use your pattern matcher but instead of
    "Character.toString((char)Integer.parseInt" ... stuff, I would use a look-up
    table
    ~
    Here is the outline of my code:
    ~
    // __
    private HashMap<String, Integer> HMHex2Int;
    // __
    private final String aRegXPtrn = "\\\\u([0-9a-f]{4})";
    private final Pattern UKdRegX = Pattern.compile(aRegXPtrn,
    Pattern.CASE_INSENSITIVE);
    // __
    private final String[] aHex2ByteTbl = new String[]{
    "00", "01", "02", "03", "04", "05", "06", "07",
    "08", "09", "0a", "0b", "0c", "0d", "0e", "0f",
    "10", "11", "12", "13", "14", "15", "16", "17",
    "18", "19", "1a", "1b", "1c", "1d", "1e", "1f",
    "20", "21", "22", "23", "24", "25", "26", "27",
    "28", "29", "2a", "2b", "2c", "2d", "2e", "2f",
    "30", "31", "32", "33", "34", "35", "36", "37",
    "38", "39", "3a", "3b", "3c", "3d", "3e", "3f",
    "40", "41", "42", "43", "44", "45", "46", "47",
    "48", "49", "4a", "4b", "4c", "4d", "4e", "4f",
    "50", "51", "52", "53", "54", "55", "56", "57",
    "58", "59", "5a", "5b", "5c", "5d", "5e", "5f",
    "60", "61", "62", "63", "64", "65", "66", "67",
    "68", "69", "6a", "6b", "6c", "6d", "6e", "6f",
    "70", "71", "72", "73", "74", "75", "76", "77",
    "78", "79", "7a", "7b", "7c", "7d", "7e", "7f",
    "80", "81", "82", "83", "84", "85", "86", "87",
    "88", "89", "8a", "8b", "8c", "8d", "8e", "8f",
    "90", "91", "92", "93", "94", "95", "96", "97",
    "98", "99", "9a", "9b", "9c", "9d", "9e", "9f",
    "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7",
    "a8", "a9", "aa", "ab", "ac", "ad", "ae", "af",
    "b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7",
    "b8", "b9", "ba", "bb", "bc", "bd", "be", "bf",
    "c0", "c1", "c2", "c3", "c4", "c5", "c6", "c7",
    "c8", "c9", "ca", "cb", "cc", "cd", "ce", "cf",
    "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
    "d8", "d9", "da", "db", "dc", "dd", "de", "df",
    "e0", "e1", "e2", "e3", "e4", "e5", "e6", "e7",
    "e8", "e9", "ea", "eb", "ec", "ed", "ee", "ef",
    "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7",
    "f8", "f9", "fa", "fb", "fc", "fd", "fe", "ff"
    };
    ~
    // __ ctor
    ~
    HMHex2Int = new HashMap<String, Integer>();
    for(int i = 0; (i < aHex2ByteTbl.length); ++i){ HMHex2Int.put(aHex2ByteTbl, HMHex2Int.size()); }
    ~
    then:
    ~
    // __ converts from \(\)u#### (front slash u sequences not turn to strings by the compiler) to unikd
    public String unescapeHex2String(String aFSU) throws UnsupportedEncodingException{
    StringBuilder aBldr = null;
    // __
    int iFSUL;
    if((aFSU != null) && ((iFSUL = aFSU.length()) > 0)){
    int[] iHex = new int[2];
    int iHexArL = iHex.length;
    String aUKdS;
    aBldr = new StringBuilder();
    // __
    Matcher UKdRegXMtx = UKdRegX.matcher(aFSU);
    // __
    while (UKdRegXMtx.find()){
    aUKdS = aFSU.substring((UKdRegXMtx.start() + 2), UKdRegXMtx.end());
    // __
    for(int j = 0; (j < iHexArL); ++j){ iHex[j] = HMHex2Byte.get(aUKdS.substring(2*j, 2*(j + 1)).toLowerCase()).intValue(); }// j [0, iHexArL)
    // __
    aBldr.append((char)(16*iHex[0] + iHex[1]));
    }
    }// ((aFSU != null) && ((iFSUL = aFSU.length()) > 0))
    // __
    return(aBldr.toString());
    }
    ~
    lbrtchx
    , Aug 4, 2012
    #1
    1. Advertising

  2. Arne Vajhøj Guest

    On 8/4/2012 1:47 PM, wrote:
    > ~
    > I would use your pattern matcher but instead of
    > "Character.toString((char)Integer.parseInt" ... stuff, I would use a look-up
    > table
    > ~


    You could.

    But I am not sure that it is practical.

    > Here is the outline of my code:
    > ~
    > // __
    > private HashMap<String, Integer> HMHex2Int;
    > // __
    > private final String aRegXPtrn = "\\\\u([0-9a-f]{4})";
    > private final Pattern UKdRegX = Pattern.compile(aRegXPtrn,
    > Pattern.CASE_INSENSITIVE);
    > // __
    > private final String[] aHex2ByteTbl = new String[]{
    > "00", "01", "02", "03", "04", "05", "06", "07",
    > "08", "09", "0a", "0b", "0c", "0d", "0e", "0f",
    > "10", "11", "12", "13", "14", "15", "16", "17",
    > "18", "19", "1a", "1b", "1c", "1d", "1e", "1f",
    > "20", "21", "22", "23", "24", "25", "26", "27",
    > "28", "29", "2a", "2b", "2c", "2d", "2e", "2f",
    > "30", "31", "32", "33", "34", "35", "36", "37",
    > "38", "39", "3a", "3b", "3c", "3d", "3e", "3f",
    > "40", "41", "42", "43", "44", "45", "46", "47",
    > "48", "49", "4a", "4b", "4c", "4d", "4e", "4f",
    > "50", "51", "52", "53", "54", "55", "56", "57",
    > "58", "59", "5a", "5b", "5c", "5d", "5e", "5f",
    > "60", "61", "62", "63", "64", "65", "66", "67",
    > "68", "69", "6a", "6b", "6c", "6d", "6e", "6f",
    > "70", "71", "72", "73", "74", "75", "76", "77",
    > "78", "79", "7a", "7b", "7c", "7d", "7e", "7f",
    > "80", "81", "82", "83", "84", "85", "86", "87",
    > "88", "89", "8a", "8b", "8c", "8d", "8e", "8f",
    > "90", "91", "92", "93", "94", "95", "96", "97",
    > "98", "99", "9a", "9b", "9c", "9d", "9e", "9f",
    > "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7",
    > "a8", "a9", "aa", "ab", "ac", "ad", "ae", "af",
    > "b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7",
    > "b8", "b9", "ba", "bb", "bc", "bd", "be", "bf",
    > "c0", "c1", "c2", "c3", "c4", "c5", "c6", "c7",
    > "c8", "c9", "ca", "cb", "cc", "cd", "ce", "cf",
    > "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
    > "d8", "d9", "da", "db", "dc", "dd", "de", "df",
    > "e0", "e1", "e2", "e3", "e4", "e5", "e6", "e7",
    > "e8", "e9", "ea", "eb", "ec", "ed", "ee", "ef",
    > "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7",
    > "f8", "f9", "fa", "fb", "fc", "fd", "fe", "ff"
    > };
    > ~
    > // __ ctor
    > ~
    > HMHex2Int = new HashMap<String, Integer>();
    > for(int i = 0; (i < aHex2ByteTbl.length); ++i){ HMHex2Int.put(aHex2ByteTbl, HMHex2Int.size()); }
    > ~
    > then:
    > ~
    > // __ converts from \(\)u#### (front slash u sequences not turn to strings by the compiler) to unikd
    > public String unescapeHex2String(String aFSU) throws UnsupportedEncodingException{
    > StringBuilder aBldr = null;
    > // __
    > int iFSUL;
    > if((aFSU != null) && ((iFSUL = aFSU.length()) > 0)){
    > int[] iHex = new int[2];
    > int iHexArL = iHex.length;
    > String aUKdS;
    > aBldr = new StringBuilder();
    > // __
    > Matcher UKdRegXMtx = UKdRegX.matcher(aFSU);
    > // __
    > while (UKdRegXMtx.find()){
    > aUKdS = aFSU.substring((UKdRegXMtx.start() + 2), UKdRegXMtx.end());
    > // __
    > for(int j = 0; (j < iHexArL); ++j){ iHex[j] = HMHex2Byte.get(aUKdS.substring(2*j, 2*(j + 1)).toLowerCase()).intValue(); }// j [0, iHexArL)
    > // __
    > aBldr.append((char)(16*iHex[0] + iHex[1]));
    > }
    > }// ((aFSU != null) && ((iFSUL = aFSU.length()) > 0))
    > // __
    > return(aBldr.toString());
    > }
    > ~


    But:
    1) the code is difficult to read
    2) HMHex2Byte is not declared - it probably is HMHex2Int
    3) it seems as if you lookup 4 bit values in an 8 bit table??
    4) the code does not handle code points >255

    Arne
    Arne Vajhøj, Aug 7, 2012
    #2
    1. Advertising

Want to reply to this thread or ask your own question?

It takes just 2 minutes to sign up (and it's free!). Just click the sign up button to choose a username and then you can ask your own questions on the forum.
Similar Threads
  1. Replies:
    8
    Views:
    313
    Arne Vajhøj
    Aug 7, 2012
  2. Replies:
    0
    Views:
    112
  3. qwertmonkey
    Replies:
    8
    Views:
    254
    Arne Vajhøj
    Aug 8, 2012
  4. qwertmonkey
    Replies:
    0
    Views:
    182
    qwertmonkey
    Aug 3, 2012
  5. Replies:
    0
    Views:
    168
Loading...

Share This Page