Modulo:undecorate
MODULO | ||
Memtesto ne disponeblas. |
- prilaboras signoĉenon konvertante certajn latinajn signojn kun ĉapeloj al krudaj latinaj literoj "A"..."Z" kaj "a"..."z"
- uzata en
{{sendekoraciigo}}
,{{la-ligilo}}
--[===[
MODULE "UNDECORATE" (undecorate)
"eo.wiktionary.org/wiki/Modulo:undecorate" <!--2023-Apr-19-->
Purpose: processes a string, converting certain decorated Latin
letters to raw Latin ASCII letters "A"..."Z" and "a"..."z"
Utilo: prilaboras signocxenon konvertante certajn dekoracihavajn latinajn
literojn al krudaj latinaj askiaj literoj "A"..."Z" kaj "a"..."z"
Manfaat: mengonversi sebuah string ...
Syfte: bearbetar en straeng genom att konvertera vissa dekorerade latinska
bokstaever till raaa latinska ASCII bokstaever "A"..."Z" och "a"..."z"
Used by templates / Uzata far sxablonoj:
- "SXablono:sendekoraciigo" <- "SXablono:deveno3" "SXablono:t"
"SXablono:alilivivo"
- "SXablono:la-ligilo"
Required submodules / Bezonataj submoduloj / Submodul yang diperlukan: none
This module can accept parameters whether sent to itself (own frame) or
to the caller (caller's frame). If there is a parameter "caller=true"
on the own frame then that own frame is discarded in favor of the
caller's one.
Parameters: * 1 anonymous and obligatory parameter
* UTF8 text (empty legal)
* 1 anonymous and optional parameter
* "1" to attack the additional chars (by default
they are kept unchanged)
Returned: * ASCII text (empty can occur)
This module is unbreakable (when called with correct module name
and function name).
Cxi tiu modulo estas nerompebla (kiam vokita kun gxustaj nomo de modulo
kaj nomo de funkcio).
Special diagnostic strings in output:
* "XXX" -- truncated or invalid UTF8 stream in input
* "YYY" -- 4-oct in input
* "ZZZ" -- internal error, broken "contabudkonv"
Format of data inside "contabudkonv":
* sequence of line subblocks
* UINT8 : target ASCII code
* n * UINT8 : sequence of source UTF8 char:s; b7 ZERO in the last
octet of a char means that the char is additional (ZERO shifts
the range from $80...$BF down to $00...$3F, conflict is
impossible since the length of the char is known in advance
from its beginning octet); the sequence of UTF8 char:s ends
when a value below $80 is found where a beginning octet is
expected (either the target ASCII letter of the next line,
or ASCII "5" as the final termination value)
* ASCII "5" ($35) as termination value
]===]
-- Table collecting the exported functions; it is returned at the end of the module.
local exporttable = {}
------------------------------------------------------------------------
---- CONSTANTS [O] ----
------------------------------------------------------------------------
-- Conversion table searched by "lfudokonv". It is ZERO-based (note the
-- explicit "[0]=" on the first row).
-- Every row holds one target ASCII letter followed by the UTF8 octets of the
-- source chars that map to it. A continuation octet stored below 128 stands
-- for the stored value plus 128 and flags the char as "additional", ie it is
-- converted only when the caller requests so. The lone 53 (ASCII "5") at the
-- end terminates the table.
-- The row comments list the source chars as code points, with the
-- additional ones inside "(...)".
local contabudkonv = {
[0]=65,195,129,195,128,195,130,195,131,195,5,195,4,196,128,196,130, -- "A" : U+00C1 U+00C0 U+00C2 U+00C3 (U+00C5) (U+00C4) U+0100 U+0102
97,195,161,195,160,195,162,195,163,195,37,195,36,196,129,196,131, -- "a" : U+00E1 U+00E0 U+00E2 U+00E3 (U+00E5) (U+00E4) U+0101 U+0103
67,195,135,196,134,196,8, -- "C" : U+00C7 U+0106 (U+0108)
99,195,167,196,135,196,9, -- "c" : U+00E7 U+0107 (U+0109)
69,196,146,196,148,195,139,195,137,195,138, -- "E" : U+0112 U+0114 U+00CB U+00C9 U+00CA
101,196,147,196,149,195,171,195,169,195,170, -- "e" : U+0113 U+0115 U+00EB U+00E9 U+00EA
71,196,28, -- "G" : (U+011C)
103,196,29, -- "g" : (U+011D)
72,196,36, -- "H" : (U+0124)
104,196,37, -- "h" : (U+0125)
73,196,170,196,172,195,143,195,141, -- "I" : U+012A U+012C U+00CF U+00CD
105,196,171,196,173,195,175,195,173, -- "i" : U+012B U+012D U+00EF U+00ED
74,196,52, -- "J" : (U+0134)
106,196,53, -- "j" : (U+0135) -- this 53 is a continuation octet, NOT the terminator
78,195,145, -- "N" : U+00D1
110,195,177, -- "n" : U+00F1
79,197,140,197,142,195,147,195,148,195,149,195,22,197,144, -- "O" : U+014C U+014E U+00D3 U+00D4 U+00D5 (U+00D6) U+0150
111,197,141,197,143,195,179,195,180,195,181,195,54,197,145, -- "o" : U+014D U+014F U+00F3 U+00F4 U+00F5 (U+00F6) U+0151
83,197,28, -- "S" : (U+015C)
115,197,29, -- "s" : (U+015D)
85,197,170,197,172,195,156,195,154,195,153,197,174,197,176, -- "U" : U+016A U+016C U+00DC U+00DA U+00D9 U+016E U+0170
117,197,171,197,173,195,188,195,186,195,185,197,175,197,177, -- "u" : U+016B U+016D U+00FC U+00FA U+00F9 U+016F U+0171
89,197,184,200,178, -- "Y" : U+0178 U+0232
121,195,191,200,179, -- "y" : U+00FF U+0233
90,197,189, -- "Z" : U+017D
122,197,190, -- "z" : U+017E
53 -- terminator (ASCII "5")
}
------------------------------------------------------------------------
---- MATH FUNCTIONS [E] ----
------------------------------------------------------------------------
-- Local function MATHDIV
-- Floored division: the quotient "xdividens / xdivisero" rounded towards
-- minus infinity. Written as a function because no DIV operator is used here.
local function mathdiv (xdividens, xdivisero)
  return math.floor (xdividens / xdivisero)
end--function mathdiv
-- Local function MATHMOD
-- Remainder of "xdividendo" divided by "xdivisoro", a thin wrapper around
-- the "%" operator.
local function mathmod (xdividendo, xdivisoro)
  return (xdividendo % xdivisoro)
end--function mathmod
------------------------------------------------------------------------
-- Local function MATHBITTEST
-- Tell whether the single bit selected by a ZERO-based index is set.
-- Input  : * numincoming -- number to inspect
--          * numbitindex -- ZERO-based bit index (ZERO is the lowest bit)
-- Output : * boolean -- "true" if the selected bit is ONE
-- Depends on functions :
-- [E] mathdiv mathmod
local function mathbittest (numincoming, numbitindex)
  -- halve the value once per index step; stop early when nothing is left
  while ((numbitindex~=0) and (numincoming~=0)) do
    numincoming = mathdiv(numincoming,2) -- shift right
    numbitindex = numbitindex - 1 -- count down to ZERO
  end--while
  return (mathmod(numincoming,2)==1) -- the wanted bit is now the lowest one
end--function mathbittest
------------------------------------------------------------------------
---- UTF8 FUNCTIONS [U] ----
------------------------------------------------------------------------
-- Local function LFULNUTF8CHAR
-- Derive the length in octet:s of a UTF8 char from its beginning octet.
-- Input  : * numbgoctet -- beginning octet of a UTF8 char
-- Output : * number 1...4, or ZERO if the octet cannot begin a char
-- Only this single octet is inspected, the following ones are NOT checked.
local function lfulnutf8char (numbgoctet)
  if (numbgoctet<128) then
    return 1 -- $00...$7F -- ANSI/ASCII
  elseif ((numbgoctet>=194) and (numbgoctet<=223)) then
    return 2 -- $C2 to $DF
  elseif ((numbgoctet>=224) and (numbgoctet<=239)) then
    return 3 -- $E0 to $EF
  elseif ((numbgoctet>=240) and (numbgoctet<=244)) then
    return 4 -- $F0 to $F4
  end--if
  return 0 -- $80...$C1 or $F5...$FF -- invalid as beginning octet
end--function lfulnutf8char
------------------------------------------------------------------------
-- Local function LFUSPLITB7NOW
-- Take apart a UINT8 read from "contabudkonv".
-- Input  : * nummain -- stored octet
-- Output : * the octet with b7 forced to ONE (128 is added if b7 was ZERO)
--          * boolean -- "true" if b7 was already ONE in the stored octet
-- Depends on functions :
-- [E] mathbittest mathdiv mathmod
-- Called only from lfudokonv.
local function lfusplitb7now (nummain)
  if (mathbittest (nummain,7)) then
    return nummain,true -- b7 already set, value is used as it stands
  end--if
  return (nummain + 128),false -- restore the real octet, report cleared b7
end--function lfusplitb7now
------------------------------------------------------------------------
-- Local function LFUDOKONV
-- Look up one 2-octet or 3-octet UTF8 char in "contabudkonv" and return
-- its undecorated replacement if there is one.
-- Input  : * numinutf5len -- length of the char, 2 or 3
--          * numvlsrc0, numvlsrc1, numvlsrc2 -- octets of the char
--            ("numvlsrc2" is looked at only if the length is 3)
--          * booalso5addi -- "true" to convert the additional chars too
-- Output : * string, one of
--            * "YYY" -- "numinutf5len" is neither 2 nor 3
--            * "ZZZ" -- "contabudkonv" is broken
--            * the char itself, unchanged -- not in the table, or
--              additional and not requested
--            * single ASCII letter -- the replacement
-- Depends on functions :
-- [U] lfulnutf8char lfusplitb7now
-- [E] mathbittest mathdiv mathmod
-- Depends on constants :
-- * table "contabudkonv" -- ZERO-based, length is unknown but the end
--   is marked with value 53
-- Called only from lfuremovedeko.
-- Note the meaning of the boolean delivered by "lfusplitb7now":
-- * "true" (b7 ONE) -- char is base, always converted
-- * "false" (b7 ZERO) -- char is additional, converted only if
--   "booalso5addi" asks for it
local function lfudokonv (numinutf5len,numvlsrc0,numvlsrc1,numvlsrc2,booalso5addi)
  local OUTBADLEN = 1 -- gives "YYY"
  local OUTBADTAB = 2 -- gives "ZZZ"
  local OUTKEEP = 3 -- char is returned unchanged
  local OUTHIT = 4 -- char is replaced by ASCII letter
  local numoutcome = 0 -- ZERO as long as the search is undecided
  local numtarget = 0 -- ASCII letter of the row being searched
  local numpos = 0 -- read position in "contabudkonv"
  if ((numinutf5len~=2) and (numinutf5len~=3)) then
    numoutcome = OUTBADLEN
  end--if
  while (numoutcome==0) do -- one turn per row ie per target ASCII letter
    numtarget = contabudkonv[numpos]
    if ((numtarget<65) or (numtarget>122)) then
      numoutcome = OUTBADTAB -- a row must begin with an ASCII letter
      break
    end--if
    numpos = numpos + 1
    while (numoutcome==0) do -- one turn per source char inside the row
      local numlead = contabudkonv[numpos]
      if (numlead<128) then -- beginning of next row, or the terminator
        if (numlead==53) then
          numoutcome = OUTKEEP -- end of table reached without any match
        end--if
        break -- the position is NOT advanced, the outer loop rereads it
      end--if
      numpos = numpos + 1
      local numentlen = lfulnutf8char (numlead)
      if ((numentlen~=2) and (numentlen~=3)) then
        numoutcome = OUTBADTAB -- the table may hold 2-oct and 3-oct only
        break
      end--if
      local numsecond, booisbase = lfusplitb7now (contabudkonv[numpos])
      numpos = numpos + 1
      local numthird = 0
      if (numentlen==3) then -- flag from the last octet wins
        numthird, booisbase = lfusplitb7now (contabudkonv[numpos])
        numpos = numpos + 1
      end--if
      local boosame = (numentlen==numinutf5len)
        and (numlead==numvlsrc0)
        and (numsecond==numvlsrc1)
        and ((numentlen~=3) or (numthird==numvlsrc2))
      if (boosame) then
        if (booisbase or booalso5addi) then
          numoutcome = OUTHIT -- take the hit
        else
          numoutcome = OUTKEEP -- additional char and not requested
        end--if
      end--if
    end--while -- over source chars
  end--while -- over rows
  if (numoutcome==OUTBADLEN) then
    return "YYY"
  end--if
  if (numoutcome==OUTBADTAB) then
    return "ZZZ"
  end--if
  if (numoutcome==OUTHIT) then
    return string.char(numtarget)
  end--if
  if (numoutcome==OUTKEEP) then
    if (numinutf5len==3) then
      return string.char(numvlsrc0,numvlsrc1,numvlsrc2)
    end--if
    return string.char(numvlsrc0,numvlsrc1)
  end--if
  return ''
end--function lfudokonv
------------------------------------------------------------------------
-- Local function LFUREMOVEDEKO
-- Remove decorations from Latin characters in a string. There is a base set
-- of chars always attacked, and an additional set attacked only if requested
-- by a boolean variable.
-- Input  : * strdedekrin -- string, empty is useless but harmless
--          * booalso6addi -- "true" to attack the additional chars too
--            (by default they are left unchanged)
-- Output : * string -- ASCII chars are copied, 2-oct and 3-oct chars are
--            replaced or copied as decided by "lfudokonv", diagnostic
--            strings are inserted where the input cannot be processed
-- Depends on functions :
-- [U] lfulnutf8char lfusplitb7now lfudokonv
-- [E] mathbittest mathdiv mathmod
-- Depends on constants :
-- * table "contabudkonv" -- length is unknown but the end is
--   marked with value 53
-- Special diagnostic strings in output:
-- * "XXX" -- truncated or invalid UTF8 stream in input, processing stops
--   and the rest of the input is dropped
-- * "YYY" -- 4-oct char in input, it is replaced by this string and
--   processing continues (previously such a char gave "XXX" and
--   stopped, contrary to the documented diagnostics)
-- * "ZZZ" -- internal error, broken "contabudkonv" (from "lfudokonv")
-- Only the beginning octet of every char is inspected, the following octets
-- are taken as they are.
local function lfuremovedeko (strdedekrin, booalso6addi)
  local tabpieces = {} -- output fragments, joined once at the end
  local numpieces = 0
  local numsrclen = string.len (strdedekrin)
  local numsrcind = 0 -- ZERO-based, ie number of octets consumed so far
  local numvalsrle = 0
  local numvalsrc0 = 0 -- beginning octet
  local numvalsrc1 = 0 -- 2nd octet
  local numvalsrc2 = 0 -- 3rd octet
  local function xxlodsb () -- upvalues "strdedekrin" (const) and "numsrcind" (upd)
    numsrcind = numsrcind + 1
    return string.byte (strdedekrin,numsrcind)
  end--function xxlodsb
  local function xxemit (strpiece) -- upvalues "tabpieces" and "numpieces" (upd)
    numpieces = numpieces + 1
    tabpieces[numpieces] = strpiece
  end--function xxemit
  while (numsrcind<numsrclen) do
    numvalsrc0 = xxlodsb ()
    numvalsrle = lfulnutf8char (numvalsrc0)
    if (numvalsrle==1) then
      xxemit (string.char (numvalsrc0)) -- ASCII, do not even attempt
    elseif ((numvalsrle==0) or ((numsrcind+numvalsrle-1)>numsrclen)) then
      xxemit ("XXX") -- invalid beginning octet, or char runs past the end
      break
    elseif (numvalsrle==4) then
      numsrcind = numsrcind + 3 -- skip the 3 following octets of the char
      xxemit ("YYY") -- 4-oct chars are not supported by the table
    else -- length is 2 or 3 here
      numvalsrc1 = xxlodsb ()
      if (numvalsrle==3) then
        numvalsrc2 = xxlodsb ()
      end--if
      xxemit (lfudokonv (numvalsrle,numvalsrc0,numvalsrc1,numvalsrc2,booalso6addi))
    end--if
  end--while
  return table.concat (tabpieces) -- avoids quadratic cost of repeated ".."
end--function lfuremovedeko
------------------------------------------------------------------------
---- VARIABLES [R] ----
------------------------------------------------------------------------
-- Exported function EK, the entry point of the module.
-- Input  : * arxframent -- frame object, it is expected to carry a table
--            "args" and a method "getParent"
-- Parameters read from "args" (of the own frame, or of the caller's frame
-- if the own frame has "caller=true"):
-- * [1] -- text to process (anything but a string is taken as empty)
-- * [2] -- "1" to attack the additional chars too
-- Output : * string -- result of "lfuremovedeko", can be empty
-- A missing frame, missing "args" or missing parent frame do NOT raise an
-- error, they result in an empty parameter set.
function exporttable.ek (arxframent)
  -- special type "args" AKA "arx"
  local arxsomons = nil -- metaized "args" from our own or caller's "frame"
  local arxparent = nil -- caller's "frame" if requested and available
  -- general "str"
  local strintxt = '' -- input string from [1]
  local strret = '' -- result string
  -- general "boo"
  local booalso7edd = false -- from [2]
  ---- GET THE ARX (ONE OF TWO) ----
  if (type(arxframent)=="table") then
    arxsomons = arxframent.args -- "args" from our own "frame"
  end--if
  if (type(arxsomons)~="table") then
    arxsomons = {} -- guard against indexing error from our own
  end--if
  if (arxsomons['caller']=="true") then
    arxsomons = nil
    if (type(arxframent.getParent)=="function") then
      arxparent = arxframent:getParent() -- can be "nil" if there is no caller
    end--if
    if (type(arxparent)=="table") then
      arxsomons = arxparent.args -- "args" from caller's "frame"
    end--if
  end--if
  if (type(arxsomons)~="table") then
    arxsomons = {} -- guard against indexing error again
  end--if
  ---- GET THE PARAMETERS ----
  strintxt = arxsomons[1]
  if (type(strintxt)~='string') then
    strintxt = ''
  end--if
  booalso7edd = (arxsomons[2]=='1') -- "1" AKA "true" to attack additional
  ---- CARRY OUT THE HARD WORK ----
  strret = lfuremovedeko (strintxt, booalso7edd)
  ---- RETURN THE RESULT STRING ----
  return strret -- can happen to be empty
end--function
---- RETURN THE LUA TABLE HOLDING THE EXPORTED FUNCTION "ek" ----
return exporttable