How to create python codecs?

Discussion in 'Python' started by yrogirg@gmail.com, Aug 6, 2008.

  1. Guest

    Actually, I need a UTF-8 to UTF-8 encoding which would change the text
    to another keyboard layout (e.g. from English to Russian: ghbdtn ->
    привет) and would not affect other symbols.

    I'm totally new to Python and to more or less advanced programming. I
    couldn't find the answer to this question anywhere.

    I've tried to create a simple UTF-to-UTF codec for some symbols, but it
    doesn't work. Here it is.



    import codecs

    ### Codec APIs

    class Codec(codecs.Codec):
        """Stateless codec for the 'rulayout' character map.

        Both directions delegate to the charmap helpers in ``codecs`` and are
        driven by the module-level ``encoding_table`` / ``decoding_table``.
        The method bodies must be indented under the class; with the flat
        layout the module fails to import with an IndentationError.
        """

        def encode(self, input, errors='strict'):
            """Encode *input* via ``encoding_table``.

            Returns the result of ``codecs.charmap_encode`` unchanged (the
            ``(output, length_consumed)`` pair the codec API expects).
            *errors* is forwarded as the error-handling scheme.
            """
            return codecs.charmap_encode(input, errors, encoding_table)

        def decode(self, input, errors='strict'):
            """Decode *input* via ``decoding_table``.

            Returns the result of ``codecs.charmap_decode`` unchanged (the
            ``(output, length_consumed)`` pair the codec API expects).
            *errors* is forwarded as the error-handling scheme.
            """
            return codecs.charmap_decode(input, errors, decoding_table)

    class IncrementalEncoder(codecs.IncrementalEncoder):
        """Incremental encoder backed by the module-level ``encoding_table``."""

        def encode(self, input, final=False):
            """Encode one chunk of *input* and return only the encoded output.

            ``codecs.charmap_encode`` returns an ``(output, length)`` pair; the
            incremental API wants just the output, hence the ``[0]``.  The
            subscript has to stay on the same line as the call: on a line of
            its own it is a separate (unreachable) statement and the whole
            tuple would be returned instead.

            *final* is accepted for interface compatibility and is not used.
            """
            return codecs.charmap_encode(input, self.errors, encoding_table)[0]

    class IncrementalDecoder(codecs.IncrementalDecoder):
        """Incremental decoder backed by the module-level ``decoding_table``."""

        def decode(self, input, final=False):
            """Decode one chunk of *input* and return only the decoded output.

            ``codecs.charmap_decode`` returns an ``(output, length)`` pair; the
            incremental API wants just the output, hence the ``[0]``.  The
            subscript has to stay on the same line as the call: on a line of
            its own it is a separate (unreachable) statement and the whole
            tuple would be returned instead.

            *final* is accepted for interface compatibility and is not used.
            """
            return codecs.charmap_decode(input, self.errors, decoding_table)[0]

    class StreamWriter(Codec, codecs.StreamWriter):
        """Stream writer that takes ``encode`` from ``Codec``.

        ``Codec`` is listed first so its ``encode`` wins in the method
        resolution order; nothing else needs to be overridden.
        """

    class StreamReader(Codec, codecs.StreamReader):
        """Stream reader that takes ``decode`` from ``Codec``.

        ``Codec`` is listed first so its ``decode`` wins in the method
        resolution order; nothing else needs to be overridden.
        """

    ### encodings module API

    def getregentry():
        """Return the ``codecs.CodecInfo`` record describing the 'rulayout' codec.

        The stateless ``encode``/``decode`` entries are bound methods of
        throwaway ``Codec`` instances; the incremental and stream entries are
        passed as classes so the codec machinery can instantiate them itself.

        NOTE(review): the name follows the ``encodings`` package convention for
        codec modules -- confirm how this module is actually registered (e.g.
        placed in ``encodings`` or hooked up through ``codecs.register``).
        """
        return codecs.CodecInfo(
            name='rulayout',
            encode=Codec().encode,
            decode=Codec().decode,
            incrementalencoder=IncrementalEncoder,
            incrementaldecoder=IncrementalDecoder,
            streamreader=StreamReader,
            streamwriter=StreamWriter,
        )


    ### Decoding Table

    # 256-entry decoding table: the character at index N is what byte N decodes
    # to.  Adjacent string literals inside the parentheses are concatenated
    # into one string, so every entry comment must stay on its own line -- a
    # wrapped comment tail becomes stray code and breaks the literal.
    #
    # The layout below is the same as ISO 8859-5: bytes 0x00-0xA0 map to the
    # identical code points, 0xA1-0xFF map to Cyrillic.
    # NOTE(review): Latin letters map to themselves here, so this table does
    # not perform any keyboard-layout substitution by itself.
    decoding_table = (
        u'\u0000'  # 0x00 -> NULL
        u'\u0001'  # 0x01 -> START OF HEADING
        u'\u0002'  # 0x02 -> START OF TEXT
        u'\u0003'  # 0x03 -> END OF TEXT
        u'\u0004'  # 0x04 -> END OF TRANSMISSION
        u'\u0005'  # 0x05 -> ENQUIRY
        u'\u0006'  # 0x06 -> ACKNOWLEDGE
        u'\u0007'  # 0x07 -> BELL
        u'\u0008'  # 0x08 -> BACKSPACE
        u'\u0009'  # 0x09 -> HORIZONTAL TABULATION
        u'\u000A'  # 0x0A -> LINE FEED
        u'\u000B'  # 0x0B -> VERTICAL TABULATION
        u'\u000C'  # 0x0C -> FORM FEED
        u'\u000D'  # 0x0D -> CARRIAGE RETURN
        u'\u000E'  # 0x0E -> SHIFT OUT
        u'\u000F'  # 0x0F -> SHIFT IN
        u'\u0010'  # 0x10 -> DATA LINK ESCAPE
        u'\u0011'  # 0x11 -> DEVICE CONTROL ONE
        u'\u0012'  # 0x12 -> DEVICE CONTROL TWO
        u'\u0013'  # 0x13 -> DEVICE CONTROL THREE
        u'\u0014'  # 0x14 -> DEVICE CONTROL FOUR
        u'\u0015'  # 0x15 -> NEGATIVE ACKNOWLEDGE
        u'\u0016'  # 0x16 -> SYNCHRONOUS IDLE
        u'\u0017'  # 0x17 -> END OF TRANSMISSION BLOCK
        u'\u0018'  # 0x18 -> CANCEL
        u'\u0019'  # 0x19 -> END OF MEDIUM
        u'\u001A'  # 0x1A -> SUBSTITUTE
        u'\u001B'  # 0x1B -> ESCAPE
        u'\u001C'  # 0x1C -> FILE SEPARATOR
        u'\u001D'  # 0x1D -> GROUP SEPARATOR
        u'\u001E'  # 0x1E -> RECORD SEPARATOR
        u'\u001F'  # 0x1F -> UNIT SEPARATOR
        u'\u0020'  # 0x20 -> SPACE
        u'\u0021'  # 0x21 -> EXCLAMATION MARK
        u'\u0022'  # 0x22 -> QUOTATION MARK
        u'\u0023'  # 0x23 -> NUMBER SIGN
        u'\u0024'  # 0x24 -> DOLLAR SIGN
        u'\u0025'  # 0x25 -> PERCENT SIGN
        u'\u0026'  # 0x26 -> AMPERSAND
        u'\u0027'  # 0x27 -> APOSTROPHE
        u'\u0028'  # 0x28 -> LEFT PARENTHESIS
        u'\u0029'  # 0x29 -> RIGHT PARENTHESIS
        u'\u002A'  # 0x2A -> ASTERISK
        u'\u002B'  # 0x2B -> PLUS SIGN
        u'\u002C'  # 0x2C -> COMMA
        u'\u002D'  # 0x2D -> HYPHEN-MINUS
        u'\u002E'  # 0x2E -> FULL STOP
        u'\u002F'  # 0x2F -> SOLIDUS
        u'\u0030'  # 0x30 -> DIGIT ZERO
        u'\u0031'  # 0x31 -> DIGIT ONE
        u'\u0032'  # 0x32 -> DIGIT TWO
        u'\u0033'  # 0x33 -> DIGIT THREE
        u'\u0034'  # 0x34 -> DIGIT FOUR
        u'\u0035'  # 0x35 -> DIGIT FIVE
        u'\u0036'  # 0x36 -> DIGIT SIX
        u'\u0037'  # 0x37 -> DIGIT SEVEN
        u'\u0038'  # 0x38 -> DIGIT EIGHT
        u'\u0039'  # 0x39 -> DIGIT NINE
        u'\u003A'  # 0x3A -> COLON
        u'\u003B'  # 0x3B -> SEMICOLON
        u'\u003C'  # 0x3C -> LESS-THAN SIGN
        u'\u003D'  # 0x3D -> EQUALS SIGN
        u'\u003E'  # 0x3E -> GREATER-THAN SIGN
        u'\u003F'  # 0x3F -> QUESTION MARK
        u'\u0040'  # 0x40 -> COMMERCIAL AT
        u'\u0041'  # 0x41 -> LATIN CAPITAL LETTER A
        u'\u0042'  # 0x42 -> LATIN CAPITAL LETTER B
        u'\u0043'  # 0x43 -> LATIN CAPITAL LETTER C
        u'\u0044'  # 0x44 -> LATIN CAPITAL LETTER D
        u'\u0045'  # 0x45 -> LATIN CAPITAL LETTER E
        u'\u0046'  # 0x46 -> LATIN CAPITAL LETTER F
        u'\u0047'  # 0x47 -> LATIN CAPITAL LETTER G
        u'\u0048'  # 0x48 -> LATIN CAPITAL LETTER H
        u'\u0049'  # 0x49 -> LATIN CAPITAL LETTER I
        u'\u004A'  # 0x4A -> LATIN CAPITAL LETTER J
        u'\u004B'  # 0x4B -> LATIN CAPITAL LETTER K
        u'\u004C'  # 0x4C -> LATIN CAPITAL LETTER L
        u'\u004D'  # 0x4D -> LATIN CAPITAL LETTER M
        u'\u004E'  # 0x4E -> LATIN CAPITAL LETTER N
        u'\u004F'  # 0x4F -> LATIN CAPITAL LETTER O
        u'\u0050'  # 0x50 -> LATIN CAPITAL LETTER P
        u'\u0051'  # 0x51 -> LATIN CAPITAL LETTER Q
        u'\u0052'  # 0x52 -> LATIN CAPITAL LETTER R
        u'\u0053'  # 0x53 -> LATIN CAPITAL LETTER S
        u'\u0054'  # 0x54 -> LATIN CAPITAL LETTER T
        u'\u0055'  # 0x55 -> LATIN CAPITAL LETTER U
        u'\u0056'  # 0x56 -> LATIN CAPITAL LETTER V
        u'\u0057'  # 0x57 -> LATIN CAPITAL LETTER W
        u'\u0058'  # 0x58 -> LATIN CAPITAL LETTER X
        u'\u0059'  # 0x59 -> LATIN CAPITAL LETTER Y
        u'\u005A'  # 0x5A -> LATIN CAPITAL LETTER Z
        u'\u005B'  # 0x5B -> LEFT SQUARE BRACKET
        u'\u005C'  # 0x5C -> REVERSE SOLIDUS
        u'\u005D'  # 0x5D -> RIGHT SQUARE BRACKET
        u'\u005E'  # 0x5E -> CIRCUMFLEX ACCENT
        u'\u005F'  # 0x5F -> LOW LINE
        u'\u0060'  # 0x60 -> GRAVE ACCENT
        u'\u0061'  # 0x61 -> LATIN SMALL LETTER A
        u'\u0062'  # 0x62 -> LATIN SMALL LETTER B
        u'\u0063'  # 0x63 -> LATIN SMALL LETTER C
        u'\u0064'  # 0x64 -> LATIN SMALL LETTER D
        u'\u0065'  # 0x65 -> LATIN SMALL LETTER E
        u'\u0066'  # 0x66 -> LATIN SMALL LETTER F
        u'\u0067'  # 0x67 -> LATIN SMALL LETTER G
        u'\u0068'  # 0x68 -> LATIN SMALL LETTER H
        u'\u0069'  # 0x69 -> LATIN SMALL LETTER I
        u'\u006A'  # 0x6A -> LATIN SMALL LETTER J
        u'\u006B'  # 0x6B -> LATIN SMALL LETTER K
        u'\u006C'  # 0x6C -> LATIN SMALL LETTER L
        u'\u006D'  # 0x6D -> LATIN SMALL LETTER M
        u'\u006E'  # 0x6E -> LATIN SMALL LETTER N
        u'\u006F'  # 0x6F -> LATIN SMALL LETTER O
        u'\u0070'  # 0x70 -> LATIN SMALL LETTER P
        u'\u0071'  # 0x71 -> LATIN SMALL LETTER Q
        u'\u0072'  # 0x72 -> LATIN SMALL LETTER R
        u'\u0073'  # 0x73 -> LATIN SMALL LETTER S
        u'\u0074'  # 0x74 -> LATIN SMALL LETTER T
        u'\u0075'  # 0x75 -> LATIN SMALL LETTER U
        u'\u0076'  # 0x76 -> LATIN SMALL LETTER V
        u'\u0077'  # 0x77 -> LATIN SMALL LETTER W
        u'\u0078'  # 0x78 -> LATIN SMALL LETTER X
        u'\u0079'  # 0x79 -> LATIN SMALL LETTER Y
        u'\u007A'  # 0x7A -> LATIN SMALL LETTER Z
        u'\u007B'  # 0x7B -> LEFT CURLY BRACKET
        u'\u007C'  # 0x7C -> VERTICAL LINE
        u'\u007D'  # 0x7D -> RIGHT CURLY BRACKET
        u'\u007E'  # 0x7E -> TILDE
        u'\u007F'  # 0x7F -> DELETE
        u'\u0080'  # 0x80 -> <control>
        u'\u0081'  # 0x81 -> <control>
        u'\u0082'  # 0x82 -> <control>
        u'\u0083'  # 0x83 -> <control>
        u'\u0084'  # 0x84 -> <control>
        u'\u0085'  # 0x85 -> <control>
        u'\u0086'  # 0x86 -> <control>
        u'\u0087'  # 0x87 -> <control>
        u'\u0088'  # 0x88 -> <control>
        u'\u0089'  # 0x89 -> <control>
        u'\u008A'  # 0x8A -> <control>
        u'\u008B'  # 0x8B -> <control>
        u'\u008C'  # 0x8C -> <control>
        u'\u008D'  # 0x8D -> <control>
        u'\u008E'  # 0x8E -> <control>
        u'\u008F'  # 0x8F -> <control>
        u'\u0090'  # 0x90 -> <control>
        u'\u0091'  # 0x91 -> <control>
        u'\u0092'  # 0x92 -> <control>
        u'\u0093'  # 0x93 -> <control>
        u'\u0094'  # 0x94 -> <control>
        u'\u0095'  # 0x95 -> <control>
        u'\u0096'  # 0x96 -> <control>
        u'\u0097'  # 0x97 -> <control>
        u'\u0098'  # 0x98 -> <control>
        u'\u0099'  # 0x99 -> <control>
        u'\u009A'  # 0x9A -> <control>
        u'\u009B'  # 0x9B -> <control>
        u'\u009C'  # 0x9C -> <control>
        u'\u009D'  # 0x9D -> <control>
        u'\u009E'  # 0x9E -> <control>
        u'\u009F'  # 0x9F -> <control>
        u'\u00A0'  # 0xA0 -> NO-BREAK SPACE
        u'\u0401'  # 0xA1 -> CYRILLIC CAPITAL LETTER IO
        u'\u0402'  # 0xA2 -> CYRILLIC CAPITAL LETTER DJE
        u'\u0403'  # 0xA3 -> CYRILLIC CAPITAL LETTER GJE
        u'\u0404'  # 0xA4 -> CYRILLIC CAPITAL LETTER UKRAINIAN IE
        u'\u0405'  # 0xA5 -> CYRILLIC CAPITAL LETTER DZE
        u'\u0406'  # 0xA6 -> CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
        u'\u0407'  # 0xA7 -> CYRILLIC CAPITAL LETTER YI
        u'\u0408'  # 0xA8 -> CYRILLIC CAPITAL LETTER JE
        u'\u0409'  # 0xA9 -> CYRILLIC CAPITAL LETTER LJE
        u'\u040A'  # 0xAA -> CYRILLIC CAPITAL LETTER NJE
        u'\u040B'  # 0xAB -> CYRILLIC CAPITAL LETTER TSHE
        u'\u040C'  # 0xAC -> CYRILLIC CAPITAL LETTER KJE
        u'\u00AD'  # 0xAD -> SOFT HYPHEN
        u'\u040E'  # 0xAE -> CYRILLIC CAPITAL LETTER SHORT U
        u'\u040F'  # 0xAF -> CYRILLIC CAPITAL LETTER DZHE
        u'\u0410'  # 0xB0 -> CYRILLIC CAPITAL LETTER A
        u'\u0411'  # 0xB1 -> CYRILLIC CAPITAL LETTER BE
        u'\u0412'  # 0xB2 -> CYRILLIC CAPITAL LETTER VE
        u'\u0413'  # 0xB3 -> CYRILLIC CAPITAL LETTER GHE
        u'\u0414'  # 0xB4 -> CYRILLIC CAPITAL LETTER DE
        u'\u0415'  # 0xB5 -> CYRILLIC CAPITAL LETTER IE
        u'\u0416'  # 0xB6 -> CYRILLIC CAPITAL LETTER ZHE
        u'\u0417'  # 0xB7 -> CYRILLIC CAPITAL LETTER ZE
        u'\u0418'  # 0xB8 -> CYRILLIC CAPITAL LETTER I
        u'\u0419'  # 0xB9 -> CYRILLIC CAPITAL LETTER SHORT I
        u'\u041A'  # 0xBA -> CYRILLIC CAPITAL LETTER KA
        u'\u041B'  # 0xBB -> CYRILLIC CAPITAL LETTER EL
        u'\u041C'  # 0xBC -> CYRILLIC CAPITAL LETTER EM
        u'\u041D'  # 0xBD -> CYRILLIC CAPITAL LETTER EN
        u'\u041E'  # 0xBE -> CYRILLIC CAPITAL LETTER O
        u'\u041F'  # 0xBF -> CYRILLIC CAPITAL LETTER PE
        u'\u0420'  # 0xC0 -> CYRILLIC CAPITAL LETTER ER
        u'\u0421'  # 0xC1 -> CYRILLIC CAPITAL LETTER ES
        u'\u0422'  # 0xC2 -> CYRILLIC CAPITAL LETTER TE
        u'\u0423'  # 0xC3 -> CYRILLIC CAPITAL LETTER U
        u'\u0424'  # 0xC4 -> CYRILLIC CAPITAL LETTER EF
        u'\u0425'  # 0xC5 -> CYRILLIC CAPITAL LETTER HA
        u'\u0426'  # 0xC6 -> CYRILLIC CAPITAL LETTER TSE
        u'\u0427'  # 0xC7 -> CYRILLIC CAPITAL LETTER CHE
        u'\u0428'  # 0xC8 -> CYRILLIC CAPITAL LETTER SHA
        u'\u0429'  # 0xC9 -> CYRILLIC CAPITAL LETTER SHCHA
        u'\u042A'  # 0xCA -> CYRILLIC CAPITAL LETTER HARD SIGN
        u'\u042B'  # 0xCB -> CYRILLIC CAPITAL LETTER YERU
        u'\u042C'  # 0xCC -> CYRILLIC CAPITAL LETTER SOFT SIGN
        u'\u042D'  # 0xCD -> CYRILLIC CAPITAL LETTER E
        u'\u042E'  # 0xCE -> CYRILLIC CAPITAL LETTER YU
        u'\u042F'  # 0xCF -> CYRILLIC CAPITAL LETTER YA
        u'\u0430'  # 0xD0 -> CYRILLIC SMALL LETTER A
        u'\u0431'  # 0xD1 -> CYRILLIC SMALL LETTER BE
        u'\u0432'  # 0xD2 -> CYRILLIC SMALL LETTER VE
        u'\u0433'  # 0xD3 -> CYRILLIC SMALL LETTER GHE
        u'\u0434'  # 0xD4 -> CYRILLIC SMALL LETTER DE
        u'\u0435'  # 0xD5 -> CYRILLIC SMALL LETTER IE
        u'\u0436'  # 0xD6 -> CYRILLIC SMALL LETTER ZHE
        u'\u0437'  # 0xD7 -> CYRILLIC SMALL LETTER ZE
        u'\u0438'  # 0xD8 -> CYRILLIC SMALL LETTER I
        u'\u0439'  # 0xD9 -> CYRILLIC SMALL LETTER SHORT I
        u'\u043A'  # 0xDA -> CYRILLIC SMALL LETTER KA
        u'\u043B'  # 0xDB -> CYRILLIC SMALL LETTER EL
        u'\u043C'  # 0xDC -> CYRILLIC SMALL LETTER EM
        u'\u043D'  # 0xDD -> CYRILLIC SMALL LETTER EN
        u'\u043E'  # 0xDE -> CYRILLIC SMALL LETTER O
        u'\u043F'  # 0xDF -> CYRILLIC SMALL LETTER PE
        u'\u0440'  # 0xE0 -> CYRILLIC SMALL LETTER ER
        u'\u0441'  # 0xE1 -> CYRILLIC SMALL LETTER ES
        u'\u0442'  # 0xE2 -> CYRILLIC SMALL LETTER TE
        u'\u0443'  # 0xE3 -> CYRILLIC SMALL LETTER U
        u'\u0444'  # 0xE4 -> CYRILLIC SMALL LETTER EF
        u'\u0445'  # 0xE5 -> CYRILLIC SMALL LETTER HA
        u'\u0446'  # 0xE6 -> CYRILLIC SMALL LETTER TSE
        u'\u0447'  # 0xE7 -> CYRILLIC SMALL LETTER CHE
        u'\u0448'  # 0xE8 -> CYRILLIC SMALL LETTER SHA
        u'\u0449'  # 0xE9 -> CYRILLIC SMALL LETTER SHCHA
        u'\u044A'  # 0xEA -> CYRILLIC SMALL LETTER HARD SIGN
        u'\u044B'  # 0xEB -> CYRILLIC SMALL LETTER YERU
        u'\u044C'  # 0xEC -> CYRILLIC SMALL LETTER SOFT SIGN
        u'\u044D'  # 0xED -> CYRILLIC SMALL LETTER E
        u'\u044E'  # 0xEE -> CYRILLIC SMALL LETTER YU
        u'\u044F'  # 0xEF -> CYRILLIC SMALL LETTER YA
        u'\u2116'  # 0xF0 -> NUMERO SIGN
        u'\u0451'  # 0xF1 -> CYRILLIC SMALL LETTER IO
        u'\u0452'  # 0xF2 -> CYRILLIC SMALL LETTER DJE
        u'\u0453'  # 0xF3 -> CYRILLIC SMALL LETTER GJE
        u'\u0454'  # 0xF4 -> CYRILLIC SMALL LETTER UKRAINIAN IE
        u'\u0455'  # 0xF5 -> CYRILLIC SMALL LETTER DZE
        u'\u0456'  # 0xF6 -> CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
        u'\u0457'  # 0xF7 -> CYRILLIC SMALL LETTER YI
        u'\u0458'  # 0xF8 -> CYRILLIC SMALL LETTER JE
        u'\u0459'  # 0xF9 -> CYRILLIC SMALL LETTER LJE
        u'\u045A'  # 0xFA -> CYRILLIC SMALL LETTER NJE
        u'\u045B'  # 0xFB -> CYRILLIC SMALL LETTER TSHE
        u'\u045C'  # 0xFC -> CYRILLIC SMALL LETTER KJE
        u'\u00A7'  # 0xFD -> SECTION SIGN
        u'\u045E'  # 0xFE -> CYRILLIC SMALL LETTER SHORT U
        u'\u045F'  # 0xFF -> CYRILLIC SMALL LETTER DZHE
    )

    ### Encoding table
    # Inverse mapping (character -> byte), derived from the decoding table so
    # the two directions cannot drift apart.
    encoding_table = codecs.charmap_build(decoding_table)
    , Aug 6, 2008
    #1
    1. Advertising

Want to reply to this thread or ask your own question?

It takes just 2 minutes to sign up (and it's free!). Just click the sign up button to choose a username and then you can ask your own questions on the forum.
Similar Threads
  1. Radovan Garabik

    how to register private python codecs?

    Radovan Garabik, Jul 1, 2003, in forum: Python
    Replies:
    1
    Views:
    737
    Steven Taschuk
    Jul 1, 2003
  2. Eric Brunel
    Replies:
    3
    Views:
    555
    Richard Brodie
    Jun 28, 2005
  3. Mike Currie

    Python UTF-8 and codecs

    Mike Currie, Jun 27, 2006, in forum: Python
    Replies:
    7
    Views:
    1,141
    Serge Orlov
    Jun 28, 2006
  4. David Hughes
    Replies:
    1
    Views:
    679
    Peter Otten
    Jan 3, 2007
  5. Karl Knechtel
    Replies:
    2
    Views:
    366
    Walter Dörwald
    Jul 10, 2012
Loading...

Share This Page