Warning: this is an htmlized version!
The original is at <http://angg.twu.net/e/charsets.e>; the conversion rules are linked from the htmlized page, <http://angg.twu.net/e/charsets.e.html>.
####### # # E-scripts on using codepage850, latin1+ and the math-enhanced # versions of these charsets # # Note 1: use the eev command (defined in eev.el) and the # ee alias (in my .zshrc) to execute parts of this file. # Executing this file as a whole makes no sense. # An introduction to eev can be found here: # # (find-eev-quick-intro) # http://angg.twu.net/eev-intros/find-eev-quick-intro.html # # Note 2: be VERY careful and make sure you understand what # you're doing. # # Note 3: If you use a shell other than zsh things like |& # and the for loops may not work. # # Note 4: I always run as root. # # Note 5: some parts are too old and don't work anymore. Some # never worked. # # Note 6: the definitions for the find-xxxfile commands are on my # .emacs. # # Note 7: if you see a strange command check my .zshrc -- it may # be defined there as a function or an alias. # # Note 8: the sections without dates are always older than the # sections with dates. # # This file is at <http://angg.twu.net/e/charsets.e> # or at <http://angg.twu.net/e/charsets.e.html>. # See also <http://angg.twu.net/emacs.html>, # <http://angg.twu.net/.emacs[.html]>, # <http://angg.twu.net/.zshrc[.html]>, # <http://angg.twu.net/escripts.html>, # and <http://angg.twu.net/>. 
# ####### # «.tcs» (to "tcs") # «.tcs_latin1-850» (to "tcs_latin1-850") # «.tcs_patch» (to "tcs_patch") # «.latin1-850_font» (to "latin1-850_font") # «.page_to_latin1-850» (to "page_to_latin1-850") # «.charset_indicator» (to "charset_indicator") # «.mapscrn» (to "mapscrn") # «.recode» (to "recode") # «.pdftotext» (to "pdftotext") # «.utf-8» (to "utf-8") # «.u8_to_l1» (to "u8_to_l1") # «.l1_to_u8» (to "l1_to_u8") # «.unicode-data» (to "unicode-data") # «.unzip-unicode-problem» (to "unzip-unicode-problem") # «.iconv» (to "iconv") # (find-es "print" "a2ps-cp850") ##### # # tcs (for conversion between charsets) # 2001jan03 # ##### # «tcs» (to ".tcs") # (to "tcs_patch") # (find-status "tcs") # (find-vldifile "tcs.list") # (find-fline "/usr/doc/tcs/") # The .ps is just a ps version of the manpage: #gv /usr/doc/tcs/tcs.ps.gz # (eeman "1 tcs") #* pdsc $SDEBIAN/dists/potato/main/source/text/tcs_1-6.dsc #* tcs -lv |& tee ~/o #* # (find-fline "~/o") # (code-c-d "tcs" "/usr/src/tcs-1/") # (find-tcsfile "tcs.c" "850") # (find-tcsfile "tcs.c" "tabps2[256] =") # (find-tcsfile "tcs.c" "tab8859_1[256] =") # (find-tcsfile "") # (find-tcsfile "regress") # (find-tcsfile "Makefile") # (find-man "7 utf-8") ##### # # tcs: comparing the 850 and latin1 charsets and building latin1-850 # 2001jan03 # ##### # «tcs_latin1-850» (to ".tcs_latin1-850") # (to "tcs") # (to "tcs_patch") # (to "latin1-850_font") # Look for repetitions in the tables that interest me (850 and latin1)... 
#* # Test "lsort": # (eeman "3tcl lsort") expect -c ' puts [lsort -integer {1 2 3 10 20 10 11}] puts [lsort -integer {1 2 3 10 20 10 11 0x0a}] ' #* # (find-tcsfile "tcs.c" "tabps2[256] =") cat > /tmp/ps2runes0 <<'---' 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f 00c7 00fc 00e9 00e2 00e4 00e0 00e5 00e7 00ea 00eb 00e8 00ef 00ee 00ec 00c4 00c5 00c9 00e6 00c6 00f4 00f6 00f2 00fb 00f9 00ff 00d6 00dc 00f8 00a3 00d8 00d7 0192 00e1 00ed 00f3 00fa 00f1 00d1 00aa 00ba 00bf 00ae 00ac 00bd 00bc 00a1 00ab 00bb 2591 2592 2593 2502 2524 00c1 00c2 00c0 00a9 2563 2551 2557 255d 00a2 00a5 2510 2514 2534 252c 251c 2500 253c 00e3 00c3 255a 2554 2569 2566 2560 2550 256c 00a4 00f0 00d0 00ca 00cb 00c8 0131 00cd 00ce 00cf 2518 250c 2588 2584 00a6 00cc 2580 00d3 00df 00d4 00d2 00f5 00d5 00b5 00fe 00de 00da 00db 00d9 00fd 00dd 00af 00b4 00ad 00b1 2017 00be 00b6 00a7 00f7 00b8 00b0 00a8 00b7 00b9 00b3 00b2 220e 00a0 --- # (find-tcsfile "tcs.c" "tab8859_1[256] =") cat > /tmp/latin1runes0 <<'---' 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f 50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f 60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f 70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f 80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df e0 e1 
e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff --- expect -c ' foreach hex [exec cat /tmp/ps2runes0] {lappend ps2runes [expr 0x$hex]} puts [join [lsort -integer $ps2runes] "\n"] ' > /tmp/ps2runes.sort uniq -d /tmp/ps2runes.sort expect -c ' foreach hex [exec cat /tmp/latin1runes0] {lappend latin1runes [expr 0x$hex]} puts [join [lsort -integer $latin1runes] "\n"] ' > /tmp/latin1runes.sort uniq -d /tmp/latin1runes.sort # No repetitions in any of the two tables; this is very good! wc /tmp/ps2runes.sort wc /tmp/latin1runes.sort comm /tmp/latin1runes.sort /tmp/ps2runes.sort > /tmp/ocomm # (find-fline "/tmp/ocomm") #* # In fact things are much better yet. Inspecting the output of "comm" # we see that the runes that are in only one of the tables are # 128..159 for latin1 (that are blank) and some runes >=0x100 on # cp850: expect -c ' foreach hex [exec cat /tmp/ps2runes0] { if "0x$hex>256" { lappend ps2runes 0x$hex } } puts [join [lsort -integer $ps2runes] ", "] ' | tee ~/o # (find-fline "~/o") # 0x0131, 0x0192, 0x2017, 0x220e, 0x2500, 0x2502, 0x250c, 0x2510, # 0x2514, 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, # 0x2551, 0x2554, 0x2557, 0x255a, 0x255d, 0x2560, 0x2563, 0x2566, # 0x2569, 0x256c, 0x2580, 0x2584, 0x2588, 0x2591, 0x2592, 0x2593, #* ##### # # tcs patch - adding the "latin1-850" charset # 2000jan06 # ##### # «tcs_patch» (to ".tcs_patch") # (to "tcs_latin1-850") # Patch tcs to add a new charset to it, "latin1-850", such that the # conversion latin1-850<->ps2/cp850 is reversible and equivalent to # latin1<->ps2/cp850 on all the usual chars. 
# (find-angg ".zshrc" "charsets") #* # (wrap nil) pdsc $SDEBIAN/dists/potato/main/source/text/tcs_1-6.dsc cd /usr/src/tcs-1/ patch -p0 <<'%%%' --- tcs.c.orig Wed Oct 23 19:13:20 1996 +++ tcs.c Sat Jan 6 19:18:08 2001 @@ -330,6 +330,31 @@ 0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff, }; +/* A special table with the cp850/ps2 runes in a latin1ish order + * See: (find-es "print" "tcs_patch") + */ +long tab8859_1_850[256] = +{ +0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, +0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f, +0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f, +0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f, +0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f, +0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f, +0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f, +0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f, +0x0131, 0x0192, 0x2017, 0x220e, 0x2500, 0x2502, 0x250c, 0x2510, +0x2514, 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, +0x2551, 0x2554, 0x2557, 0x255a, 0x255d, 0x2560, 0x2563, 0x2566, +0x2569, 0x256c, 0x2580, 0x2584, 0x2588, 0x2591, 0x2592, 0x2593, +0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf, +0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf, +0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf, +0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf, +0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef, +0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff, +}; + long tab8859_2[256] = { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, @@ -934,6 +959,7 
@@ { "ascii", "7-bit ASCII", Table, (void *)tabascii }, { "8859-1", "Latin-1 (Western and Northern Europe including Italian)", Table, (void *)tab8859_1 }, { "latin1", "ISO 8859-1", Table, (void *)tab8859_1 }, + { "latin1-850", "Latin-1 with the ps2 runes (edrx)", Table, (void *)tab8859_1_850 }, { "8859-2", "Latin-2 (Eastern Europe except Turkey and the Baltic countries)", Table, (void *)tab8859_2 }, { "8859-3", "Latin-3 (Mediterranean, South Africa, Esperanto)", Table, (void *)tab8859_3 }, { "8859-4", "Latin-4 (Scandinavia and the Baltic countries; obsolete)", Table, (void *)tab8859_4 }, %%% patch -b -p0 debian/changelog <<'%%%' 0a1,6 > tcs (1-6edrx) custom; urgency=low > > * Added the latin1-850 charset. > > -- Eduardo Ochs <edrx@inx.com.br> Tue, 19 Jun 2001 22:00:00 +0200 > %%% cd /usr/src/tcs-1/ debian/rules binary |& tee odrb #* dpkg -i /usr/src/tcs_1-6edrx_i386.deb #* ##### # # Make a latin1-850 font from a cp850 font # 2000jan07 # ##### # «latin1-850_font» (to ".latin1-850_font") # (to "tcs_latin1-850") # (find-angg ".zshrc" "charsets") #* # Get the scrambling table expect -c ' for {set i 0; set s {}} {$i<256} {incr i} {append s [format %c $i]} puts -nonewline $s ' > /tmp/256 wc /tmp/256 isoto850 < /tmp/256 > /tmp/256b # 850toiso < /tmp/256 > /tmp/256b #* # Apply the scramble to a font cd ~/MTA/ expect -c ' proc readfile {fname} { set ch [open $fname r]; set bigstr [read $ch]; close $ch return $bigstr } proc writefile {fname bigstr} { set ch [open $fname w]; puts -nonewline $ch $bigstr; close $ch } proc ord {c} { scan $c %c n; return $n } proc fcharnew {n} { global map; fchar [ord [string index $map $n]] } proc fchar {n} { global origfont ht string range $origfont [expr $n*$ht] [expr $n*$ht+$ht-1] } set origfont [readfile ega1.8] set ht 8 set map [readfile /tmp/256b] for {set i 0; set s {}} {$i<256} {incr i} {append s [fcharnew $i]} writefile latin1-850.8 $s ' #* # (find-fline "~/MTA/vtutil" "examples of usage:") cd ~/MTA/ ./vtutilsh vtutil rowsofbigchars 
latin1-850.8 8 | l #* 850toiso < ~/MTA/vtutil > /tmp/vtutil-latin cd ~/MTA/ ./vtutilsh /tmp/vtutil-latin modifyfont 256 8 latin1-850.8 latinmath.8 ./vtutilsh /tmp/vtutil-latin rowsofbigchars latinmath.8 8 | l ./vtutilsh /tmp/vtutil-latin setfont 256 8 latinmath.8 file0 #* ~/MTA/vtutilsh /tmp/vtutil-latin ##### # # converting my public home stuff to latin1-850 # 2000jan07 # ##### # «page_to_latin1-850» (to ".page_to_latin1-850") # This is just a first test. Don't take it seriously. #* # (find-es "escripts" "makepagedeb_angg") makeLedrxtgz rm -R /tmp/edrxdeb mkdir /tmp/edrxdeb cd /tmp/edrxdeb cp -v ~root/bin/edrxpage . cp -v ~root/TH/L/a/s/edrx.tgz . chmod 755 edrxpage ./edrxpage make_package #* rm -Rv /tmp/e/ mkdir /tmp/e/ cd /tmp/e/ for i in $(cd $ES; print -l *.e); do echo $i; 850toiso < $ES/$i > $i; done # (find-fline "$ES/escripts.e") # (find-fline "$ES/tex.e" "ee-charset-indicator") # (find-fline "/tmp/e/tex.e") #* rm -Rv /tmp/edrx1 mkdir /tmp/edrx1 cd /tmp/edrx1 edrxpage THR # find * -type f | egrep '(8|16|gz|png)$' # find * -type f | egrep -v '(8|16|gz|png)$' for i in $(find * -type f | egrep -v '(8|16|gz|png)$'); do 850toiso < $i > o chmod --reference=$i o mv -v o $i done #* cd /tmp/edrx1 HOME=`pwd` zsh math #* ##### # # Which characters to use as charset indicators? 
# 2000jan09 # ##### # «charset_indicator» (to ".charset_indicator") # (to "latin1-850_font") #* perl -e 'for ($i=0; $i<256; ++$i) { printf "%c", $i }' > /tmp/256 # od -t x1 /tmp/256 # od -t u1 /tmp/256 isoto850 < /tmp/256 > /tmp/256b expect -c ' set scramble [exec cat /tmp/256b] proc ord {str} { scan $str "%c" ord; return $ord } proc char {n} { format %c $n } proc scramble {n} { global scramble ord [string index $scramble $n] } for {set i 128} {$i<256} {incr i} { set cycle $i for {set j [scramble $i]} {$j!=$i} {set j [scramble $j]} { lappend cycle $j } puts [format "legth %2d: %s" [llength $cycle] $cycle] } ' | sort | tee ~/o #* # We have three cycles of length 3, one of length 8, one of 15, one of # 23 and one of 73... # (+ 3 3 3 8 15 23 73) # Taking one representative of each cycle length among the chars in # the range 160..254 we get: 3->180 8->163 15->195 23->161 73->160 # (format "%c%c%c%c%c" 160 161 163 180 195) expect -c 'puts [format "%c%c%c%c%c" 160 161 163 180 195]' expect -c 'puts [format "%c%c%c%c%c" 160 161 163 180 195]' \ | 850toiso expect -c 'puts [format "%c%c%c%c%c" 160 161 163 180 195]' \ | isoto850 #* # (find-fline "~/o") # Note: the 23-cycle has only one char in the range 128..159, 148: # (148 188 172 170 166 221 237 161 173 240 208 209 165 190 243 162 189 # 171 174 169 184 247 246) ##### # # mapscrn # 2001jan23 # ##### # «mapscrn» (to ".mapscrn") # (find-status "kbd") # (find-vldifile "kbd.list") # (find-fline "/usr/doc/kbd/") # (eeman "8 mapscrn") # (eeman "4 console_codes" "Select user mapping") #* perl -e 'for ($i=0; $i<256; ++$i) { printf "%c", $i }' > /tmp/256 wc /tmp/256 isoto850 < /tmp/256 > /tmp/256a #* echo -ne '\e(U' mapscrn /tmp/256 # Straight to ROM mapping: # # ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ # ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß # àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ 850toiso < /tmp/256 > /tmp/256b mapscrn /tmp/256b echo -ne '\e(K' # # If the screen font is latinmath then this # will look like the math850 table: # # 
¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ # ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß # àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ isoto850 < /tmp/256 > /tmp/256a mapscrn /tmp/256a echo -ne '\e(K' # # If the screen font is math850 then this # will look like the latinmath table: # # ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ # ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß # àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ #* ##### # # recode # 2004sep26 / 2024sep01 # ##### # «recode» (to ".recode") # (find-status "recode") # (find-vldifile "recode.list") # (find-udfile "recode/") # (find-status "recode-doc") # (find-vldifile "recode-doc.list") # (find-udfile "recode-doc/") # (find-node "(recode)Top") # (find-node "(recode)flat") # (find-man "recode") # (find-sh "recode --help") # (find-sh "recode -l") # (find-sh "recode --list") # (find-sh "recode --verbose --list") # (find-sh "recode --verbose --list=flat") # https://unix.stackexchange.com/questions/631652/remove-accents-from-characters # recode -f utf8..flat < textin.txt > flattext.out #* # (find-node "(recode)ASCII") recode -lf us ;# for commented ASCII recode -ld us ;# for concise decimal table recode -lo us ;# for concise octal table recode -lh us ;# for concise hexadecimal table #* # Descriptions for all chars that recode knows about # (find-node "(recode)UCS-2") # (find-node "(recode)Test") # (find-node "(recode)Dump") # (find-fline "~/o2") echo -n \ | recode u2/test16..dump \ | tail +3 \ | tee > ~/o \ | awk ' NF==1 { if (!first) { first = $1 } last = $1 } NF>1 { if (first) { printf "%s..%s\n", first, last; first = "" } print } END { if (first) { printf "%s..%s\n", first, last; first = "" } } ' \ | tee ~/o2 #* # (find-node "(recode)Requests") # (find-node "(recode)Requests" "If the double\ndot separator is omitted") echo -n | recode -v ../x1 echo -n | recode -v /x1..u8 echo -n | recode -v l1 #* # (find-man "7 utf8") # (find-node "(recode)UTF-8") echo -n àáâãä | recode l1..dump | tail +3 echo -n àáâãä | recode l1..u2 | recode ../x1 echo -n Aaàáâãä | recode l1..u2 | recode ../x2 
echo -n Aaàáâãä | recode l1..u8 | recode ../x1 #* for i in $(recode -l | tr ' ' \\n); do recode -lf $i done \ |& tee /tmp/o sort /tmp/o | uniq > /tmp/o2 #* # (find-fline "/tmp/o2") #* cd /tmp/ cat > accents.latin1 <<'%%%' `'^~" `'^~" a àáâãä ÀÁÂÃÄ åÅ e èéê ë ÈÉÊ Ë æÆ i ìíî ï ÌÍÎ Ï çÇ o òóôõö ÒÓÔÕÖ ñÑ u ùúû ü ÙÚÛ Ü ýÿ %%% recode l1..u8 < accents.latin1 > accents.utf8 #* # (find-man "1 less") LESSCHARSET=latin1 less -f /tmp/accents.{latin1,utf8} LESSCHARSET=utf-8 less -f /tmp/accents.{latin1,utf8} #* # (find-htetfile "Unicode-HOWTO.gz") #* cd /tmp/ debtarxvzf /hdd6/debian/dists/potato/main/binary-i386/utils/kbd_0.99-9.2.deb \ ./usr/bin/setfont cp -iv usr/bin/setfont ~/bin/ #* setfont ~/MTA/math1.8 loadkeys ~/MTA/defkeymap850b.map #* perl -e 'for ($i=0; $i<256; ++$i) { printf "%c", $i }' > /tmp/256 isoto850 < /tmp/256 > /tmp/256b # od -t x1 /tmp/256 # od -t u1 /tmp/256 #* perl -e 'for ($i=128; $i<256; ++$i) { printf "%c", $i }' > /tmp/128 isoto850 < /tmp/128 > /tmp/128b tr $(</tmp/128) $(</tmp/128b) < /tmp/128 | od -t x1 cat /tmp/128b | od -t x1 #* ##### # # Latin-1-ifying the output of pdftotext # 2012jan18 # ##### # «pdftotext» (to ".pdftotext") # (find-es "ps" "pdftotext") # (find-efunction 'brpdftextl) # (find-efunction 'find-pdf-text "-enc Latin1") # (find-man "1 pdftotext" "-enc encoding-name") ##### # # UTF-8 # 2013may30 # ##### # «utf-8» (to ".utf-8") # (find-es "lua5" "utf8") # (find-esgrep "grep -niH -e utf-8 *.e") # (find-esgrep "grep -niH -e utf8 *.e") # (find-man "7 utf-8") # (find-fline "/usr/share/i18n/charmaps/UTF-8.gz") # (find-fline "/usr/share/i18n/charmaps/UTF-8.gz" "ARABIC LETTER ALEF") ##### # # u8_to_l1 # 2014jul21 # ##### # «u8_to_l1» (to ".u8_to_l1") # (find-angg "LUA/lua50init.lua" "u8_to_l1") * (eepitch-shell) * (eepitch-kill) * (eepitch-shell) lua51 -e 'for i=192,252 do printf(" \\%d %c", i, i) end' | recode l1..u8 | lua51 -e ' f = function (c) return format("\\%d", string.byte(c)) end print((io.read():gsub("[\128-\255]", f))) ' ##### # # 
l1_to_u8 # 2019feb11 # ##### # «l1_to_u8» (to ".l1_to_u8") # \128 -> \194\128 # (...) # \191 -> \194\191 # \192 -> \195\128 # (...) # \255 -> \195\191 # (find-einsert '(194 10 (128 191) 10 (192 255))) # (find-angg "LUA/lua50init.lua" "toslashhex") * (eepitch-lua51) * (eepitch-kill) * (eepitch-lua51) bigstr = "" for i=128,255 do bigstr = bigstr..format("%d -> %c\n", i, i) end = bigstr writefile("/tmp/l1", bigstr) = getoutput("recode l1..u8 < /tmp/l1 > /tmp/u8") bigstr = readfile("/tmp/u8") = toslashhex(bigstr) ##### # # unicode-data # 2014sep20 # ##### # «unicode-data» (to ".unicode-data") # (find-status "unicode-data") # (find-vldifile "unicode-data.list") # (find-udfile "unicode-data/") http://www.fileformat.info/info/unicode/char/1d312/index.htm http://www.fileformat.info/info/unicode/char/1d312/browsertest.htm http://www.fileformat.info/info/unicode/char/1d40d/index.htm ##### # # unzip-unicode-problem # 2021jun16 # ##### # «unzip-unicode-problem» (to ".unzip-unicode-problem") # (find-man "unzip") # (find-sh "unzip") # (find-sh "unzip -hh") # (find-sh "unzip -hh" "-U [UNICODE enabled]") # (find-sh "unzip -hh" "-^") # (find-sh "unzip -hh" "Unicode:") * (eepitch-shell) * (eepitch-kill) * (eepitch-shell) # (find-fline "/tmp/tark/") rm -Rv /tmp/tark/ mkdir /tmp/tark/ cd /tmp/tark/ unzip -U ~/tmp/tark-pequenos.zip # (find-fline "~/tmp/tark-pequenos.zip") <edrx> anyone knows 1) the name of this encoding 2) if recode can deal with it? when I unzip .zips that contains files with accents in their filenames I oftern get filenames like this: "Hoje n#U00e3o Haver#U00e1 Sa#U00edda Livre" <grym> maybe i can go swimming in it legally in 2022 https://thecharles.org/city-splash/ <homerj> edrx: I think if you light a candle and say that, you can talk to dead people <bpalmer> edrx: unicode e3 is latin small letter a with tilde; would that make sense? 
*** clone QUIT Quit: WeeChat 3.1 <homerj> apalmer, known for half lemonade/half iced tea <bpalmer> and similarly e1 is small letter a with bacute accent, and ed is small letter i with acute accent. <edrx> bpalmer: yes, that's it. after conversion that would be "Hoje não haverá saída livre". <homerj> bpalmer, known for memorizing unicode *** muto JOIN <bpalmer> edrx: so it looks like the dumbest possible way of generating a filename that every filesystem should accept. <bpalmer> (ignoring length) <edrx> bpalmer: agreed <bpalmer> I had to look up a unicode table, sadly. *** hmmmas JOIN *** holomorph JOIN <edrx> I can adapt this code - https://lists.gnu.org/archive/html/eev/2021-06/msg00010.html - to recognize this encoding, but it would be easier to just run "recode thisencoding..l1" on each filename. *** irek JOIN *** abhixec_ QUIT Ping timeout: 268 seconds <grym> rudybot: unicode sandwiches, that's what's for dinner <rudybot> grym: i buy my sandwiches in rods <grym> kinky ##### # # iconv # 2021sep06 # ##### # «iconv» (to ".iconv") # (find-man "1 iconv") # (find-fline "~/LOGS/2021sep06.emacs") # (find-fline "~/LOGS/2021sep06.emacs" "legendsofkyrandia12walk.htm") * (eepitch-shell) * (eepitch-kill) * (eepitch-shell) cd /tmp/ curl -s --insecure \ https://www.thecomputershow.com/computershow/walkthroughs/legendsofkyrandia12walk.htm \ | iconv --from CPIBM861 --to UTF-8//IGNORE \ > /tmp/o.html # (find-fline "/tmp/o.html") * (eepitch-lua51) * (eepitch-kill) * (eepitch-lua51) -- http://www.thecomputershow.com/computershow/walkthroughs/legendsofkyrandia12walk.htm fname = "$S/http/www.thecomputershow.com/computershow/walkthroughs/legendsofkyrandia12walk.htm" bigstr0 = ee_readfile(fname) lines0 = splitlines(bigstr0) lines = {} for i=546,746 do table.insert(lines, lines0[i]) end bigstr = table.concat(lines, "\n") seqs = SetL.new() for s in bigstr:gmatch("[\128-\255]+") do seqs:add(s) end = seqs:ksc() numberedseqs = SetL.new() for i,s in ipairs(seqs:ks()) do numberedseqs:add(s, 
i) PP(i, s) end # (find-fline "~/LOGS/2023feb03.emacs") # (find-fline "~/LOGS/2023feb03.emacs" "emoji-zwj-sequences") # https://unicode.org/emoji/charts/emoji-zwj-sequences.html # https://blog.emojipedia.org/fun-emoji-hacks/ 2012nov05, lua-l (from William Ahearn): http://docs.parrot.org/parrot/devel/html/docs/pdds/pdd28_strings.pod.html https://www.quora.com/Why-is-there-no-character-for-superscript-q-in-Unicode https://www.compart.com/en/unicode/U+02D9 # Local Variables: # coding: utf-8-unix # End: