e/charsets.e (htmlized)

Warning: this is an htmlized version!
The original is here, and
the conversion rules are here.
#######
#
# E-scripts on using codepage850, latin1+ and the math-enhanced
#   versions of these charsets
#
# Note 1: use the eev command (defined in eev.el) and the
# ee alias (in my .zshrc) to execute parts of this file.
# Executing this file as a whole makes no sense.
# An introduction to eev can be found here:
#
#   (find-eev-quick-intro)
#   http://angg.twu.net/eev-intros/find-eev-quick-intro.html
#
# Note 2: be VERY careful and make sure you understand what
# you're doing.
#
# Note 3: If you use a shell other than zsh things like |&
# and the for loops may not work.
#
# Note 4: I always run as root.
#
# Note 5: some parts are too old and don't work anymore. Some
# never worked.
#
# Note 6: the definitions for the find-xxxfile commands are in my
# .emacs.
#
# Note 7: if you see a strange command check my .zshrc -- it may
# be defined there as a function or an alias.
#
# Note 8: the sections without dates are always older than the
# sections with dates.
#
# This file is at <http://angg.twu.net/e/charsets.e>
#           or at <http://angg.twu.net/e/charsets.e.html>.
#        See also <http://angg.twu.net/emacs.html>,
#                 <http://angg.twu.net/.emacs[.html]>,
#                 <http://angg.twu.net/.zshrc[.html]>,
#                 <http://angg.twu.net/escripts.html>,
#             and <http://angg.twu.net/>.
#
#######




# «.tcs»			(to "tcs")
# «.tcs_latin1-850»		(to "tcs_latin1-850")
# «.tcs_patch»			(to "tcs_patch")
# «.latin1-850_font»		(to "latin1-850_font")
# «.page_to_latin1-850»		(to "page_to_latin1-850")
# «.charset_indicator»		(to "charset_indicator")
# «.mapscrn»			(to "mapscrn")
# «.recode»			(to "recode")
# «.pdftotext»			(to "pdftotext")
# «.utf-8»			(to "utf-8")
# «.u8_to_l1»			(to "u8_to_l1")
# «.l1_to_u8»			(to "l1_to_u8")
# «.unicode-data»		(to "unicode-data")
# «.unzip-unicode-problem»	(to "unzip-unicode-problem")
# «.iconv»			(to "iconv")



# (find-es "print" "a2ps-cp850")



#####
#
# tcs (for conversion between charsets)
# 2001jan03
#
#####

# «tcs»  (to ".tcs")
# (to "tcs_patch")
# (find-status "tcs")
# (find-vldifile "tcs.list")
# (find-fline "/usr/doc/tcs/")
# The .ps is just a ps version of the manpage:
#gv /usr/doc/tcs/tcs.ps.gz
# (eeman "1 tcs")
#*
pdsc $SDEBIAN/dists/potato/main/source/text/tcs_1-6.dsc
#*
tcs -lv |& tee ~/o
#*
# (find-fline "~/o")

# (code-c-d "tcs" "/usr/src/tcs-1/")
# (find-tcsfile "tcs.c" "850")
# (find-tcsfile "tcs.c" "tabps2[256] =")
# (find-tcsfile "tcs.c" "tab8859_1[256] =")
# (find-tcsfile "")
# (find-tcsfile "regress")
# (find-tcsfile "Makefile")

# (find-man "7 utf-8")





#####
#
# tcs: comparing the 850 and latin1 charsets and building latin1-850
# 2001jan03
#
#####

# «tcs_latin1-850»  (to ".tcs_latin1-850")
# (to "tcs")
# (to "tcs_patch")
# (to "latin1-850_font")
# Look for repetitions in the tables that interest me (850 and latin1)...
#*
# Test "lsort":
# (eeman "3tcl lsort")
expect -c '
  puts [lsort -integer {1 2 3 10 20 10 11}]
  puts [lsort -integer {1 2 3 10 20 10 11 0x0a}]
'
#*
# (find-tcsfile "tcs.c" "tabps2[256] =")
cat > /tmp/ps2runes0 <<'---'
  00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 
  10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 
  20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 
  30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f 
  40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f 
  50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f 
  60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f 
  70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f 
  00c7  00fc  00e9  00e2  00e4  00e0  00e5  00e7 
  00ea  00eb  00e8  00ef  00ee  00ec  00c4  00c5 
  00c9  00e6  00c6  00f4  00f6  00f2  00fb  00f9 
  00ff  00d6  00dc  00f8  00a3  00d8  00d7  0192 
  00e1  00ed  00f3  00fa  00f1  00d1  00aa  00ba 
  00bf  00ae  00ac  00bd  00bc  00a1  00ab  00bb 
  2591  2592  2593  2502  2524  00c1  00c2  00c0 
  00a9  2563  2551  2557  255d  00a2  00a5  2510 
  2514  2534  252c  251c  2500  253c  00e3  00c3 
  255a  2554  2569  2566  2560  2550  256c  00a4 
  00f0  00d0  00ca  00cb  00c8  0131  00cd  00ce  
  00cf  2518  250c  2588  2584  00a6  00cc  2580 
  00d3  00df  00d4  00d2  00f5  00d5  00b5  00fe 
  00de  00da  00db  00d9  00fd  00dd  00af  00b4 
  00ad  00b1  2017  00be  00b6  00a7  00f7  00b8 
  00b0  00a8  00b7  00b9  00b3  00b2  220e  00a0 
---

# (find-tcsfile "tcs.c" "tab8859_1[256] =")
cat > /tmp/latin1runes0 <<'---'
  00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 
  10 11 12 13 14 15 16 17 18 19 1a 1b 1c 1d 1e 1f 
  20 21 22 23 24 25 26 27 28 29 2a 2b 2c 2d 2e 2f 
  30 31 32 33 34 35 36 37 38 39 3a 3b 3c 3d 3e 3f 
  40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f 
  50 51 52 53 54 55 56 57 58 59 5a 5b 5c 5d 5e 5f 
  60 61 62 63 64 65 66 67 68 69 6a 6b 6c 6d 6e 6f 
  70 71 72 73 74 75 76 77 78 79 7a 7b 7c 7d 7e 7f 
  80 81 82 83 84 85 86 87 88 89 8a 8b 8c 8d 8e 8f 
  90 91 92 93 94 95 96 97 98 99 9a 9b 9c 9d 9e 9f 
  a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aa ab ac ad ae af 
  b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 ba bb bc bd be bf 
  c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 ca cb cc cd ce cf 
  d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 da db dc dd de df 
  e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 ea eb ec ed ee ef 
  f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 fa fb fc fd fe ff
---

expect -c '
  foreach hex [exec cat /tmp/ps2runes0] {lappend ps2runes [expr 0x$hex]}
  puts [join [lsort -integer $ps2runes] "\n"]
' > /tmp/ps2runes.sort
uniq -d /tmp/ps2runes.sort

expect -c '
  foreach hex [exec cat /tmp/latin1runes0] {lappend latin1runes [expr 0x$hex]}
  puts [join [lsort -integer $latin1runes] "\n"]
' > /tmp/latin1runes.sort
uniq -d /tmp/latin1runes.sort

# No repetitions in either of the two tables; this is very good!
wc /tmp/ps2runes.sort
wc /tmp/latin1runes.sort

comm /tmp/latin1runes.sort /tmp/ps2runes.sort > /tmp/ocomm
# (find-fline "/tmp/ocomm")

#*
# In fact things are even better. Inspecting the output of "comm"
# we see that the runes that are in only one of the tables are
# 128..159 for latin1 (that are blank) and some runes >=0x100 on
# cp850:

# List (sorted, comma-separated) the cp850/ps2 runes that cannot be in
# latin1, i.e. those >= 0x100 (latin1 covers exactly 0x00..0xff).
expect -c '
  foreach hex [exec cat /tmp/ps2runes0] {
    if "0x$hex >= 0x100" { lappend ps2runes 0x$hex }
  }
  puts [join [lsort -integer $ps2runes] ", "]
' | tee ~/o
# (find-fline "~/o")

# 0x0131, 0x0192, 0x2017, 0x220e, 0x2500, 0x2502, 0x250c, 0x2510,
# 0x2514, 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550,
# 0x2551, 0x2554, 0x2557, 0x255a, 0x255d, 0x2560, 0x2563, 0x2566,
# 0x2569, 0x256c, 0x2580, 0x2584, 0x2588, 0x2591, 0x2592, 0x2593,

#*





#####
#
# tcs patch - adding the "latin1-850" charset
# 2001jan06
#
#####

# «tcs_patch»  (to ".tcs_patch")
# (to "tcs_latin1-850")
# Patch tcs to add a new charset to it, "latin1-850", such that the
# conversion latin1-850<->ps2/cp850 is reversible and equivalent to
# latin1<->ps2/cp850 on all the usual chars.
# (find-angg ".zshrc" "charsets")
#*
# (wrap nil)
pdsc $SDEBIAN/dists/potato/main/source/text/tcs_1-6.dsc
cd /usr/src/tcs-1/

patch -p0 <<'%%%'
--- tcs.c.orig	Wed Oct 23 19:13:20 1996
+++ tcs.c	Sat Jan  6 19:18:08 2001
@@ -330,6 +330,31 @@
 0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff,
 };
 
+/* A special table with the cp850/ps2 runes in a latin1ish order
+ * See: (find-es "print" "tcs_patch")
+ */
+long tab8859_1_850[256] =
+{
+0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
+0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
+0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
+0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
+0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
+0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
+0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
+0x0131, 0x0192, 0x2017, 0x220e, 0x2500, 0x2502, 0x250c, 0x2510,
+0x2514, 0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550,
+0x2551, 0x2554, 0x2557, 0x255a, 0x255d, 0x2560, 0x2563, 0x2566,
+0x2569, 0x256c, 0x2580, 0x2584, 0x2588, 0x2591, 0x2592, 0x2593,
+0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
+0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
+0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
+0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
+0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
+0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff,
+};
+
 long tab8859_2[256] =
 {
 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
@@ -934,6 +959,7 @@
 	{ "ascii", "7-bit ASCII", Table, (void *)tabascii },
 	{ "8859-1", "Latin-1 (Western and Northern Europe including Italian)", Table, (void *)tab8859_1 },
 	{ "latin1", "ISO 8859-1", Table, (void *)tab8859_1 },
+	{ "latin1-850", "Latin-1 with the ps2 runes (edrx)", Table, (void *)tab8859_1_850 },
 	{ "8859-2", "Latin-2 (Eastern Europe except Turkey and the Baltic countries)", Table, (void *)tab8859_2 },
 	{ "8859-3", "Latin-3 (Mediterranean, South Africa, Esperanto)", Table, (void *)tab8859_3 },
 	{ "8859-4", "Latin-4 (Scandinavia and the Baltic countries; obsolete)", Table, (void *)tab8859_4 },
%%%

patch -b -p0 debian/changelog <<'%%%'
0a1,6
> tcs (1-6edrx) custom; urgency=low
> 
>   * Added the latin1-850 charset.
> 
>  -- Eduardo Ochs <edrx@inx.com.br>  Tue, 19 Jun 2001 22:00:00 +0200
> 
%%%

cd /usr/src/tcs-1/
debian/rules binary	|& tee odrb
#*
dpkg -i /usr/src/tcs_1-6edrx_i386.deb

#*





#####
#
# Make a latin1-850 font from a cp850 font
# 2000jan07
#
#####

# «latin1-850_font»  (to ".latin1-850_font")
# (to "tcs_latin1-850")
# (find-angg ".zshrc" "charsets")

#*
# Get the scrambling table
expect -c '
  # Emit raw bytes: without binary mode Tcl would encode chars 128..255
  # with the system encoding (two bytes each under UTF-8), and the file
  # would no longer be exactly 256 bytes long.
  fconfigure stdout -translation binary
  for {set i 0; set s {}} {$i<256} {incr i} {append s [format %c $i]}
  puts -nonewline $s
' > /tmp/256
wc /tmp/256
isoto850 < /tmp/256 > /tmp/256b
# 850toiso < /tmp/256 > /tmp/256b

#*
# Apply the scramble to a font
cd ~/MTA/
expect -c '
  # Build latin1-850.8 by permuting the glyphs of ega1.8 according to the
  # byte table in /tmp/256b. Fonts and the table are raw binary data, so
  # every channel is switched to binary mode; otherwise Tcl would apply
  # the system encoding and end-of-line translation to the glyph bytes.

  # Return the whole contents of file fname as a string of bytes.
  proc readfile {fname} {
    set ch [open $fname r]; fconfigure $ch -translation binary
    set bigstr [read $ch]; close $ch
    return $bigstr
  }
  # Write bigstr to file fname, byte for byte, with no trailing newline.
  proc writefile {fname bigstr} {
    set ch [open $fname w]; fconfigure $ch -translation binary
    puts -nonewline $ch $bigstr; close $ch
  }
  # Numeric code of the character c.
  proc ord {c} { scan $c %c n; return $n }
  # Glyph for position n of the new font: the original glyph whose index
  # is the n-th byte of the map.
  proc fcharnew {n} { global map; fchar [ord [string index $map $n]] }
  # The ht bytes that make up glyph n of the original font.
  proc fchar {n} { global origfont ht
    string range $origfont [expr $n*$ht] [expr $n*$ht+$ht-1]
  }
  set origfont [readfile ega1.8]
  set ht 8
  set map [readfile /tmp/256b]
  for {set i 0; set s {}} {$i<256} {incr i} {append s [fcharnew $i]}
  writefile latin1-850.8 $s
'
#*
# (find-fline "~/MTA/vtutil" "examples of usage:")
cd ~/MTA/
./vtutilsh vtutil rowsofbigchars latin1-850.8 8 | l

#*
850toiso < ~/MTA/vtutil > /tmp/vtutil-latin
cd ~/MTA/
./vtutilsh /tmp/vtutil-latin modifyfont 256 8 latin1-850.8 latinmath.8
./vtutilsh /tmp/vtutil-latin rowsofbigchars latinmath.8 8 | l
./vtutilsh /tmp/vtutil-latin setfont 256 8 latinmath.8  file0
#*


~/MTA/vtutilsh /tmp/vtutil-latin






#####
#
# converting my public home stuff to latin1-850
# 2000jan07
#
#####

# «page_to_latin1-850»  (to ".page_to_latin1-850")
# This is just a first test. Don't take it seriously.
#*
# (find-es "escripts" "makepagedeb_angg")
makeLedrxtgz
rm -R /tmp/edrxdeb
mkdir /tmp/edrxdeb
cd    /tmp/edrxdeb
cp -v ~root/bin/edrxpage .
cp -v ~root/TH/L/a/s/edrx.tgz .
chmod 755 edrxpage
./edrxpage make_package

#*
rm -Rv /tmp/e/
mkdir  /tmp/e/
cd     /tmp/e/
for i in $(cd $ES; print -l *.e); do echo $i; 850toiso < $ES/$i > $i; done

# (find-fline "$ES/escripts.e")
# (find-fline "$ES/tex.e" "ee-charset-indicator")
# (find-fline "/tmp/e/tex.e")

#*
rm -Rv /tmp/edrx1
mkdir  /tmp/edrx1
cd     /tmp/edrx1
edrxpage THR
# find * -type f | egrep    '(8|16|gz|png)$'
# find * -type f | egrep -v '(8|16|gz|png)$'
for i in $(find * -type f | egrep -v '(8|16|gz|png)$'); do
  850toiso < $i > o
  chmod --reference=$i o
  mv -v o $i
done

#*
cd     /tmp/edrx1
HOME=`pwd` zsh
math
#*





#####
#
# Which characters to use as charset indicators?
# 2000jan09
#
#####

# «charset_indicator»  (to ".charset_indicator")
# (to "latin1-850_font")
#*
perl -e 'for ($i=0; $i<256; ++$i) { printf "%c", $i }' > /tmp/256
# od -t x1 /tmp/256
# od -t u1 /tmp/256
isoto850 < /tmp/256 > /tmp/256b

expect -c '
  # Print, for each byte 128..255, the cycle it belongs to under the
  # permutation stored in /tmp/256b, prefixed by the cycle length.
  # The table is read as raw bytes (going through exec+cat would apply
  # the system encoding and strip a trailing newline).
  set ch [open /tmp/256b r]; fconfigure $ch -translation binary
  set scramble [read $ch]; close $ch
  proc ord {str} { scan $str "%c" ord; return $ord }
  proc char {n} { format %c $n }
  # Image of n under the permutation: the code of the n-th table byte.
  proc scramble {n} { global scramble
    ord [string index $scramble $n]
  }
  for {set i 128} {$i<256} {incr i} {
    set cycle $i
    # Follow the permutation from i until it comes back to i.
    for {set j [scramble $i]} {$j!=$i} {set j [scramble $j]} {
      lappend cycle $j
    }
    puts [format "length %2d: %s" [llength $cycle] $cycle]
  }
' | sort | tee ~/o
#*

# We have three cycles of length 3, one of length 8, one of 15, one of
# 23 and one of 73...
# (+ 3 3 3 8 15 23 73)

# Taking one representative of each cycle length among the chars in
# the range 160..254 we get: 3->180 8->163 15->195 23->161 73->160
# (format "%c%c%c%c%c" 160 161 163 180 195)

expect -c 'puts [format "%c%c%c%c%c" 160 161 163 180 195]'
expect -c 'puts [format "%c%c%c%c%c" 160 161 163 180 195]' \
  | 850toiso
expect -c 'puts [format "%c%c%c%c%c" 160 161 163 180 195]' \
  | isoto850

#*
# (find-fline "~/o")
# Note: the 23-cycle has only one char in the range 128..159, 148:
# (148 188 172 170 166 221 237 161 173 240 208 209 165 190 243 162 189
#  171 174 169 184 247 246)





#####
#
# mapscrn
# 2001jan23
#
#####

# «mapscrn»  (to ".mapscrn")
# (find-status "kbd")
# (find-vldifile "kbd.list")
# (find-fline "/usr/doc/kbd/")
# (eeman "8 mapscrn")
# (eeman "4 console_codes" "Select user mapping")
#*
perl -e 'for ($i=0; $i<256; ++$i) { printf "%c", $i }' > /tmp/256
wc /tmp/256
isoto850 < /tmp/256 > /tmp/256a
#*

echo -ne '\e(U'
mapscrn /tmp/256
# Straight to ROM mapping:
#
#  ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿
# ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
# àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ


850toiso < /tmp/256 > /tmp/256b
mapscrn /tmp/256b
echo -ne '\e(K'
#
# If the screen font is latinmath then this
# will look like the math850 table:
#
#  ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿
# ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
# àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ


isoto850 < /tmp/256 > /tmp/256a
mapscrn /tmp/256a
echo -ne '\e(K'
#
# If the screen font is math850 then this
# will look like the latinmath table:
#
#  ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿
# ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß
# àáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ

#*






#####
#
# recode
# 2004sep26 / 2024sep01
#
#####

# «recode»  (to ".recode")
# (find-status   "recode")
# (find-vldifile "recode.list")
# (find-udfile   "recode/")
# (find-status   "recode-doc")
# (find-vldifile "recode-doc.list")
# (find-udfile   "recode-doc/")
# (find-node "(recode)Top")
# (find-node "(recode)flat")

# (find-man "recode")
# (find-sh "recode --help")
# (find-sh "recode -l")
# (find-sh "recode --list")
# (find-sh "recode --verbose --list")
# (find-sh "recode --verbose --list=flat")
# https://unix.stackexchange.com/questions/631652/remove-accents-from-characters
#   recode -f utf8..flat < textin.txt > flattext.out

#*
# (find-node "(recode)ASCII")
recode -lf us    ;# for commented ASCII
recode -ld us    ;# for concise decimal table
recode -lo us    ;# for concise octal table
recode -lh us    ;# for concise hexadecimal table

#*
# Descriptions for all chars that recode knows about
# (find-node "(recode)UCS-2")
# (find-node "(recode)Test")
# (find-node "(recode)Dump")
# (find-fline "~/o2")

# Dump every character recode can describe. "tail -n +3" drops the first
# two lines of the dump; the raw listing is saved in ~/o and a condensed
# version (see the awk program) is saved in ~/o2.
# "tee ~/o" is used instead of "tee > ~/o" so that the pipeline does not
# depend on zsh's MULTIOS to feed both the file and the next stage.
echo -n \
  | recode u2/test16..dump \
  | tail -n +3 \
  | tee ~/o \
  | awk '
      # Lines with a single field are accumulated into a run...
      NF==1 {
        if (!first) { first = $1 }
        last = $1
      }
      # ...which is flushed as "first..last" before the next line that
      # has more than one field; such lines are printed unchanged.
      NF>1 {
        if (first) { printf "%s..%s\n", first, last; first = "" }
        print
      }
      # Flush a run that reaches the end of the input.
      END { if (first) { printf "%s..%s\n", first, last; first = "" } }
    ' \
  | tee ~/o2
                 
#*
# (find-node "(recode)Requests")
# (find-node "(recode)Requests" "If the double\ndot separator is omitted")
echo -n | recode -v ../x1
echo -n | recode -v /x1..u8
echo -n | recode -v l1

#*
# (find-man "7 utf8")
# (find-node "(recode)UTF-8")
# "tail -n +3" (portable form of the obsolete "tail +3") skips the first
# two lines of recode's dump output.
echo -n   àáâãä | recode l1..dump | tail -n +3
echo -n   àáâãä | recode l1..u2 | recode ../x1
echo -n Aaàáâãä | recode l1..u2 | recode ../x2
echo -n Aaàáâãä | recode l1..u8 | recode ../x1

#*
for i in $(recode -l | tr ' ' \\n); do
  recode -lf $i
done \
  |& tee /tmp/o
sort /tmp/o | uniq > /tmp/o2

#*
# (find-fline "/tmp/o2")






#*
cd /tmp/
cat > accents.latin1 <<'%%%'
  `'^~" `'^~"
a àáâãä ÀÁÂÃÄ  åÅ   
e èéê ë ÈÉÊ Ë  æÆ   
i ìíî ï ÌÍÎ Ï  çÇ   
o òóôõö ÒÓÔÕÖ  ñÑ   
u ùúû ü ÙÚÛ Ü  ýÿ
%%%

recode l1..u8 < accents.latin1 > accents.utf8

#*
# (find-man "1 less")
LESSCHARSET=latin1 less -f /tmp/accents.{latin1,utf8}
LESSCHARSET=utf-8  less -f /tmp/accents.{latin1,utf8}

#*




# (find-htetfile "Unicode-HOWTO.gz")




#*
cd /tmp/
debtarxvzf /hdd6/debian/dists/potato/main/binary-i386/utils/kbd_0.99-9.2.deb \
  ./usr/bin/setfont
cp -iv usr/bin/setfont ~/bin/
#*
setfont ~/MTA/math1.8
loadkeys ~/MTA/defkeymap850b.map
#*




perl -e 'for ($i=0; $i<256; ++$i) { printf "%c", $i }' > /tmp/256
isoto850 < /tmp/256 > /tmp/256b
# od -t x1 /tmp/256
# od -t u1 /tmp/256

#*
perl -e 'for ($i=128; $i<256; ++$i) { printf "%c", $i }' > /tmp/128
isoto850 < /tmp/128 > /tmp/128b
tr $(</tmp/128) $(</tmp/128b) < /tmp/128 | od -t x1
cat /tmp/128b | od -t x1
#*






#####
#
# Latin-1-ifying the output of pdftotext
# 2012jan18
#
#####

# «pdftotext»  (to ".pdftotext")
# (find-es "ps" "pdftotext")
# (find-efunction 'brpdftextl)
# (find-efunction 'find-pdf-text "-enc Latin1")
# (find-man "1 pdftotext" "-enc encoding-name")




#####
#
# UTF-8
# 2013may30
#
#####

# «utf-8» (to ".utf-8")
# (find-es "lua5" "utf8")
# (find-esgrep "grep -niH -e utf-8 *.e")
# (find-esgrep "grep -niH -e utf8 *.e")
# (find-man "7 utf-8")
# (find-fline "/usr/share/i18n/charmaps/UTF-8.gz")
# (find-fline "/usr/share/i18n/charmaps/UTF-8.gz" "ARABIC LETTER ALEF")



#####
#
# u8_to_l1
# 2014jul21
#
#####

# «u8_to_l1» (to ".u8_to_l1")
# (find-angg "LUA/lua50init.lua" "u8_to_l1")

* (eepitch-shell)
* (eepitch-kill)
* (eepitch-shell)
lua51 -e 'for i=192,252 do printf(" \\%d %c", i, i) end' |
  recode l1..u8 |
  lua51 -e '
    f = function (c) return format("\\%d", string.byte(c)) end
    print((io.read():gsub("[\128-\255]", f)))
  '


#####
#
# l1_to_u8
# 2019feb11
#
#####

# «l1_to_u8» (to ".l1_to_u8")
# \128 -> \194\128
#    (...)
# \191 -> \194\191
# \192 -> \195\128
#    (...)
# \255 -> \195\191

# (find-einsert '(194 10 (128 191) 10 (192 255)))
# (find-angg "LUA/lua50init.lua" "toslashhex")

* (eepitch-lua51)
* (eepitch-kill)
* (eepitch-lua51)
bigstr = ""
for i=128,255 do bigstr = bigstr..format("%d -> %c\n", i, i) end
= bigstr
writefile("/tmp/l1", bigstr)
= getoutput("recode l1..u8 < /tmp/l1 > /tmp/u8")
bigstr = readfile("/tmp/u8")
= toslashhex(bigstr)








#####
#
# unicode-data
# 2014sep20
#
#####

# «unicode-data» (to ".unicode-data")
# (find-status   "unicode-data")
# (find-vldifile "unicode-data.list")
# (find-udfile   "unicode-data/")

http://www.fileformat.info/info/unicode/char/1d312/index.htm
http://www.fileformat.info/info/unicode/char/1d312/browsertest.htm
http://www.fileformat.info/info/unicode/char/1d40d/index.htm




#####
#
# unzip-unicode-problem
# 2021jun16
#
#####

# «unzip-unicode-problem»  (to ".unzip-unicode-problem")

# (find-man "unzip")
# (find-sh "unzip")
# (find-sh "unzip -hh")
# (find-sh "unzip -hh" "-U   [UNICODE enabled]")
# (find-sh "unzip -hh" "-^")
# (find-sh "unzip -hh" "Unicode:")

* (eepitch-shell)
* (eepitch-kill)
* (eepitch-shell)
# (find-fline "/tmp/tark/")
rm -Rv /tmp/tark/
mkdir  /tmp/tark/
cd     /tmp/tark/
unzip -U ~/tmp/tark-pequenos.zip

# (find-fline "~/tmp/tark-pequenos.zip")



<edrx> anyone knows 1) the name of this encoding 2) if recode can
       deal with it? when I unzip .zips that contains files with
       accents in their filenames I oftern get filenames like this:
       "Hoje n#U00e3o Haver#U00e1 Sa#U00edda Livre"
<grym> maybe i can go swimming in it legally in 2022
       https://thecharles.org/city-splash/ 
<homerj> edrx: I think if you light a candle and say that, you can
         talk to dead people
<bpalmer> edrx: unicode e3 is latin small letter a with tilde;
          would that make sense?
*** clone QUIT Quit: WeeChat 3.1
<homerj> apalmer, known for half lemonade/half iced tea
<bpalmer> and similarly e1 is small letter a with bacute accent,
          and ed is small letter i with acute accent.
<edrx> bpalmer: yes, that's it. after conversion that would be
       "Hoje não haverá saída livre".
<homerj> bpalmer, known for memorizing unicode
*** muto JOIN
<bpalmer> edrx: so it looks like the dumbest possible way of
          generating a filename that every filesystem should
          accept.
<bpalmer> (ignoring length)
<edrx> bpalmer: agreed
<bpalmer> I had to look up a unicode table, sadly.
*** hmmmas JOIN
*** holomorph JOIN
<edrx> I can adapt this code -
       https://lists.gnu.org/archive/html/eev/2021-06/msg00010.html
       - to recognize this encoding, but it would be easier to just
       run "recode thisencoding..l1" on each filename.
*** irek JOIN
*** abhixec_ QUIT Ping timeout: 268 seconds
<grym> rudybot: unicode sandwiches, that's what's for dinner
<rudybot> grym: i buy my sandwiches in rods
<grym> kinky





#####
#
# iconv
# 2021sep06
#
#####

# «iconv»  (to ".iconv")
# (find-man "1 iconv")
# (find-fline "~/LOGS/2021sep06.emacs")
# (find-fline "~/LOGS/2021sep06.emacs" "legendsofkyrandia12walk.htm")

* (eepitch-shell)
* (eepitch-kill)
* (eepitch-shell)
cd /tmp/
curl -s --insecure \
  https://www.thecomputershow.com/computershow/walkthroughs/legendsofkyrandia12walk.htm \
  | iconv --from CPIBM861 --to UTF-8//IGNORE \
  > /tmp/o.html

# (find-fline "/tmp/o.html")


* (eepitch-lua51)
* (eepitch-kill)
* (eepitch-lua51)
-- http://www.thecomputershow.com/computershow/walkthroughs/legendsofkyrandia12walk.htm
fname = "$S/http/www.thecomputershow.com/computershow/walkthroughs/legendsofkyrandia12walk.htm"
bigstr0 = ee_readfile(fname)
lines0 = splitlines(bigstr0)
lines = {}
for i=546,746 do table.insert(lines, lines0[i]) end
bigstr = table.concat(lines, "\n")
seqs = SetL.new()
for s in bigstr:gmatch("[\128-\255]+") do
  seqs:add(s)
end
= seqs:ksc()

numberedseqs = SetL.new()
for i,s in ipairs(seqs:ks()) do
  numberedseqs:add(s, i)
  PP(i, s)
end





# (find-fline "~/LOGS/2023feb03.emacs")
# (find-fline "~/LOGS/2023feb03.emacs" "emoji-zwj-sequences")
# https://unicode.org/emoji/charts/emoji-zwj-sequences.html
# https://blog.emojipedia.org/fun-emoji-hacks/



2012nov05, lua-l (from William Ahearn):
http://docs.parrot.org/parrot/devel/html/docs/pdds/pdd28_strings.pod.html

https://www.quora.com/Why-is-there-no-character-for-superscript-q-in-Unicode

https://www.compart.com/en/unicode/U+02D9





#  Local Variables:
#  coding:               utf-8-unix
#  End: