Modul:Text: Unterschied zwischen den Versionen
te>Mps |
te>PerfektesChaos (2015-03-17) |
||
Zeile 1: | Zeile 1: | ||
− | --[=[ 2015- | + | --[=[ 2015-03-17 |
Text utilities | Text utilities | ||
]=] | ]=] | ||
Zeile 6: | Zeile 6: | ||
local Text = { } | local Text = { } | ||
− | local | + | local PatternCJK = false |
− | local | + | local PatternCombined = false |
− | local | + | local PatternLatin = false |
+ | local PatternTerminated = false | ||
+ | local RangesLatin = false | ||
Zeile 55: | Zeile 57: | ||
return r | return r | ||
end -- Text.containsCJK() | end -- Text.containsCJK() | ||
+ | |||
+ | |||
+ | |||
+ | Text.isLatinRange = function ( adjust ) | ||
+ | -- Are characters expected to be latin or symbols within latin texts? | ||
+ | -- Precondition: | ||
+ | -- adjust -- string, or nil for initialization | ||
+ | -- Returns: true, if valid for latin only | ||
+ | local r | ||
+ | if not RangesLatin then | ||
+ | RangesLatin = { { 7, 687 }, | ||
+ | { 7531, 7578 }, | ||
+ | { 7680, 7935 }, | ||
+ | { 8194, 8250 } } | ||
+ | end | ||
+ | if not PatternLatin then | ||
+ | local range | ||
+ | PatternLatin = "^[" | ||
+ | for i = 1, #RangesLatin do | ||
+ | range = RangesLatin[ i ] | ||
+ | PatternLatin = PatternLatin .. | ||
+ | mw.ustring.char( range[ 1 ], 45, range[ 2 ] ) | ||
+ | end -- for i | ||
+ | PatternLatin = PatternLatin .. "]*$" | ||
+ | mw.log(PatternLatin) | ||
+ | end | ||
+ | if adjust then | ||
+ | if mw.ustring.match( adjust, PatternLatin ) then | ||
+ | r = true | ||
+ | else | ||
+ | r = false | ||
+ | end | ||
+ | end | ||
+ | return r | ||
+ | end -- Text.isLatinRange() | ||
Zeile 78: | Zeile 115: | ||
return mw.text.listToText( collect ) | return mw.text.listToText( collect ) | ||
end -- Text.listToText() | end -- Text.listToText() | ||
+ | |||
+ | |||
+ | |||
+ | Text.removeDiacritics = function ( adjust ) | ||
+ | -- Remove all diacritics | ||
+ | -- Parameter: | ||
+ | -- adjust -- string | ||
+ | -- Returns: string; all latin letters should be ASCII | ||
+ | -- or basic greek or cyrillic or symbols etc. | ||
+ | local cleanup, decomposed | ||
+ | if not PatternCombined then | ||
+ | PatternCombined = mw.ustring.char( 91, | ||
+ | 0x0300, 45, 0x036F, | ||
+ | 0x1AB0, 45, 0x1AFF, | ||
+ | 0x1DC0, 45, 0x1DFF, | ||
+ | 0xFE20, 45, 0xFE2F, | ||
+ | 93 ) | ||
+ | end | ||
+ | decomposed = mw.ustring.toNFD( adjust ) | ||
+ | cleanup = mw.ustring.gsub( decomposed, PatternCombined, "" ) | ||
+ | return mw.ustring.toNFC( cleanup ) | ||
+ | end -- Text.removeDiacritics() | ||
Zeile 88: | Zeile 147: | ||
-- Returns: true, if sentence terminated | -- Returns: true, if sentence terminated | ||
local r | local r | ||
− | if not | + | if not PatternTerminated then |
− | + | PatternTerminated = mw.ustring.char( 91, | |
12290, | 12290, | ||
65281, | 65281, | ||
Zeile 96: | Zeile 155: | ||
.. "!%.%?…][\"'%]‹›«»‘’“”]*$" | .. "!%.%?…][\"'%]‹›«»‘’“”]*$" | ||
end | end | ||
− | if mw.ustring.find( analyse, | + | if mw.ustring.find( analyse, PatternTerminated ) then |
r = true | r = true | ||
else | else | ||
Zeile 163: | Zeile 222: | ||
-- Returns: string with non-latin parts enclosed in <span> | -- Returns: string with non-latin parts enclosed in <span> | ||
local r | local r | ||
− | + | Text.isLatinRange() | |
− | + | if mw.ustring.match( adjust, PatternLatin ) then | |
− | |||
− | |||
− | |||
− | |||
− | if mw.ustring.match( adjust, | ||
-- latin only, horizontal dashes, quotes | -- latin only, horizontal dashes, quotes | ||
r = adjust | r = adjust | ||
Zeile 178: | Zeile 232: | ||
local m = false | local m = false | ||
local n = mw.ustring.len( adjust ) | local n = mw.ustring.len( adjust ) | ||
− | local span = "%s%s<span style='font-style:normal'>%s</span>" | + | local span = "%s%s<span dir='auto' style='font-style:normal'>%s</span>" |
local flat = function ( a ) | local flat = function ( a ) | ||
− | + | -- isLatin | |
− | + | local range | |
+ | for i = 1, #RangesLatin do | ||
+ | range = RangesLatin[ i ] | ||
+ | if a >= range[ 1 ] and a <= range[ 2 ] then | ||
+ | return true | ||
+ | end | ||
+ | end -- for i | ||
end -- flat() | end -- flat() | ||
local form = function ( a ) | local form = function ( a ) | ||
Zeile 231: | Zeile 291: | ||
m = m + 1 | m = m + 1 | ||
end | end | ||
− | end -- for i | + | end -- for i |
if j and ( not m or m < n ) then | if j and ( not m or m < n ) then | ||
r = form( n ) | r = form( n ) | ||
Zeile 265: | Zeile 325: | ||
function p.containsCJK( frame ) | function p.containsCJK( frame ) | ||
return Text.containsCJK( frame.args[ 1 ] or "" ) and "1" or "" | return Text.containsCJK( frame.args[ 1 ] or "" ) and "1" or "" | ||
+ | end | ||
+ | |||
+ | function p.isLatinRange( frame ) | ||
+ | return Text.isLatinRange( frame.args[1] or "" ) and "1" or "" | ||
end | end | ||
Zeile 280: | Zeile 344: | ||
end | end | ||
return Text.listToText( args, frame.args.format ) | return Text.listToText( args, frame.args.format ) | ||
+ | end | ||
+ | |||
+ | function p.removeDiacritics( frame ) | ||
+ | return Text.removeDiacritics( frame.args[1] or "" ) | ||
end | end | ||
Zeile 300: | Zeile 368: | ||
local innersep = frame.args["isep"] or "" | local innersep = frame.args["isep"] or "" | ||
local outersep = frame.args["osep"] or "" | local outersep = frame.args["osep"] or "" | ||
− | + | ||
-- Parameter parsen | -- Parameter parsen | ||
for k, v in pairs(frame.args) do | for k, v in pairs(frame.args) do | ||
Zeile 332: | Zeile 400: | ||
end | end | ||
return result | return result | ||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
end | end | ||
Version vom 20. März 2015, 11:11 Uhr
Die Dokumentation für dieses Modul kann unter Modul:Text/Doku erstellt werden
--[=[ 2015-03-17
Text utilities
]=]
-- Library table; all Text.* functions below are attached to it
-- and it is handed out to other modules by p.Text()
local Text = { }
-- File-level caches; each one is false until it has been built
-- Reserved for the CJK character class used for CJK detection
local PatternCJK = false
-- Character class of combining marks; built in Text.removeDiacritics()
local PatternCombined = false
-- Anchored pattern over RangesLatin; built in Text.isLatinRange()
local PatternLatin = false
-- Sentence-end pattern; built in Text.sentenceTerminated()
local PatternTerminated = false
-- Sequence of { first, last } code point pairs; set in Text.isLatinRange()
local RangesLatin = false
Text.concatParams = function ( args, apply, adapt )
    -- Concat list items into one string
    -- Parameter:
    --     args   -- table with numKey=string; entries with any other
    --               kind of key are skipped
    --     apply  -- string (optional); separator (default: "|")
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string; the trimmed, non-empty items in ascending
    --          order of their numeric keys
    local found = { }
    local order = { }
    local collect = { }
    local v
    -- pairs() guarantees no traversal order, so the numeric keys are
    -- gathered first and sorted before the items are processed
    for k, item in pairs( args ) do
        if type( k ) == "number" then
            found[ k ] = item
            order[ #order + 1 ] = k
        end
    end
    table.sort( order )
    for i = 1, #order do
        v = mw.text.trim( found[ order[ i ] ] )
        if v ~= "" then
            if adapt then
                v = mw.ustring.format( adapt, v )
            end
            collect[ #collect + 1 ] = v
        end
    end -- for i
    return table.concat( collect, apply or "|" )
end -- Text.concatParams()
Text.containsCJK = function ( analyse )
    -- Is any CJK code within?
    -- Parameter:
    --     analyse  -- string
    -- Returns: true, if CJK detected; false otherwise
    if not PatternCJK then
        -- Character class "[" 13312 "-" 40959  131072 "-" 178207 "]",
        -- built once and kept in the file-level local PatternCJK
        -- (a differently spelled name would create a global variable)
        PatternCJK = mw.ustring.char( 91,
                                      13312, 45, 40959,
                                      131072, 45, 178207,
                                      93 )
    end
    -- find() yields nil if nothing matched, a position otherwise
    return mw.ustring.find( analyse, PatternCJK ) ~= nil
end -- Text.containsCJK()
Text.isLatinRange = function ( adjust )
    -- Are characters expected to be latin or symbols within latin texts?
    -- Precondition:
    --     adjust  -- string, or nil for initialization
    -- Returns: true, if valid for latin only; false, if any character
    --          is outside RangesLatin; nil, if adjust is nil
    -- Side effect: RangesLatin and PatternLatin are set on first call
    local r
    if not RangesLatin then
        -- Inclusive { first, last } code point pairs
        RangesLatin = { {    7,  687 },
                        { 7531, 7578 },
                        { 7680, 7935 },
                        { 8194, 8250 } }
    end
    if not PatternLatin then
        -- "^[" first "-" last ... "]*$": the whole string has to
        -- consist of characters from the ranges (may be empty)
        local parts = { "^[" }
        local range
        for i = 1, #RangesLatin do
            range = RangesLatin[ i ]
            parts[ #parts + 1 ] = mw.ustring.char( range[ 1 ],
                                                   45,
                                                   range[ 2 ] )
        end -- for i
        parts[ #parts + 1 ] = "]*$"
        PatternLatin = table.concat( parts )
    end
    if adjust then
        -- match() yields nil on failure; an empty match is a success
        r = ( mw.ustring.match( adjust, PatternLatin ) ~= nil )
    end
    return r
end -- Text.isLatinRange()
Text.listToText = function ( args, adapt )
    -- Format list items similar to mw.text.listToText()
    -- Parameter:
    --     args   -- table with numKey=string; entries with any other
    --               kind of key are skipped
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string; the trimmed, non-empty items in ascending
    --          order of their numeric keys, joined by
    --          mw.text.listToText()
    local found = { }
    local order = { }
    local collect = { }
    local v
    -- pairs() guarantees no traversal order, so the numeric keys are
    -- gathered first and sorted before the items are processed
    for k, item in pairs( args ) do
        if type( k ) == "number" then
            found[ k ] = item
            order[ #order + 1 ] = k
        end
    end
    table.sort( order )
    for i = 1, #order do
        v = mw.text.trim( found[ order[ i ] ] )
        if v ~= "" then
            if adapt then
                v = mw.ustring.format( adapt, v )
            end
            collect[ #collect + 1 ] = v
        end
    end -- for i
    return mw.text.listToText( collect )
end -- Text.listToText()
Text.removeDiacritics = function ( adjust )
    -- Strip combining marks from a string
    -- Parameter:
    --     adjust  -- string
    -- Returns: string; all latin letters should be ASCII
    --          or basic greek or cyrillic or symbols etc.
    -- The text is decomposed (NFD), every character of the four
    -- combining-mark ranges is deleted, and the rest is recomposed (NFC)
    PatternCombined = PatternCombined
                      or mw.ustring.char( 91,
                                          0x0300, 45, 0x036F,
                                          0x1AB0, 45, 0x1AFF,
                                          0x1DC0, 45, 0x1DFF,
                                          0xFE20, 45, 0xFE2F,
                                          93 )
    local stripped = mw.ustring.gsub( mw.ustring.toNFD( adjust ),
                                      PatternCombined,
                                      "" )
    return mw.ustring.toNFC( stripped )
end -- Text.removeDiacritics()
Text.sentenceTerminated = function ( analyse )
    -- Does the string end with a dot, question or exclamation mark?
    -- Closing quotation marks, brackets and the like may follow
    -- Parameter:
    --     analyse  -- string
    -- Returns: true, if sentence terminated; false otherwise
    if not PatternTerminated then
        -- "[" followed by the code points 12290, 65281, 65294, 65311;
        -- the ASCII marks and the ellipsis complete that class,
        -- the second class permits trailing quotes and brackets
        local opening = mw.ustring.char( 91, 12290, 65281, 65294, 65311 )
        PatternTerminated = opening .. "!%.%?…][\"'%]‹›«»‘’“”]*$"
    end
    if mw.ustring.find( analyse, PatternTerminated ) then
        return true
    end
    return false
end -- Text.sentenceTerminated()
Text.ucfirstAll = function ( adjust )
    -- Capitalize all words
    -- Precondition:
    --     adjust  -- string
    -- Returns: string with all first letters in upper case
    --
    -- The word scan turns every lower case letter that follows a
    -- non-alphanumeric character into upper case. That would also hit
    -- the name of an entity ("&nbsp;" -> "&Nbsp;"). The named entities
    -- listed here are therefore swapped for their numeric form, which
    -- contains no letter, and swapped back after the scan.
    local entities = { { "&amp;",    "&#38;" },
                       { "&lt;",     "&#60;" },
                       { "&gt;",     "&#62;" },
                       { "&nbsp;",   "&#160;" },
                       { "&thinsp;", "&#8201;" },
                       { "&zwnj;",   "&#8204;" },
                       { "&zwj;",    "&#8205;" },
                       { "&lrm;",    "&#8206;" },
                       { "&rlm;",    "&#8207;" } }
    -- The leading space lets "%W%l" match the first word as well
    local r = " " .. adjust
    local i = 1
    local c, j, m
    if adjust:find( "&", 1, true ) then
        for e = 1, #entities do
            r = r:gsub( entities[ e ][ 1 ], entities[ e ][ 2 ] )
        end -- for e
        m = true
    end
    while i do
        i = mw.ustring.find( r, "%W%l", i )
        if i then
            -- i: non-alphanumeric character, j: letter to be raised
            j = i + 1
            c = mw.ustring.upper( mw.ustring.sub( r, j, j ) )
            r = string.format( "%s%s%s",
                               mw.ustring.sub( r, 1, i ),
                               c,
                               mw.ustring.sub( r, i + 2 ) )
            i = j
        end
    end -- while i
    -- Drop the leading space again
    r = r:sub( 2 )
    if m then
        for e = 1, #entities do
            r = r:gsub( entities[ e ][ 2 ], entities[ e ][ 1 ] )
        end -- for e
        -- The scan raises the "x" of a hexadecimal entity; undo that
        r = r:gsub( "&#X(%x+);", "&#x%1;" )
    end
    return r
end -- Text.ucfirstAll()
Text.uprightNonlatin = function ( adjust )
-- Ensure non-italics for non-latin text parts
-- One single greek letter might be granted
-- Precondition:
-- adjust -- string
-- Returns: string with non-latin parts enclosed in <span>;
-- adjust itself, if it matches PatternLatin entirely
local r
-- Called without argument: only initializes RangesLatin and PatternLatin
Text.isLatinRange()
if mw.ustring.match( adjust, PatternLatin ) then
-- latin only, horizontal dashes, quotes
r = adjust
else
-- State of the scan over the characters of adjust:
-- c -- code point of the current character
-- (reused for a one-character string in the backward walk)
-- j -- index where the open non-latin run started, or false
-- k -- index of the first character not yet copied into r
-- m -- index at which a latin character would mean that the open run
-- was just one greek letter, or false
-- n -- number of characters in adjust
local c
local j = false
local k = 1
local m = false
local n = mw.ustring.len( adjust )
-- Format: result so far, plain text before the run, the run in a span
local span = "%s%s<span dir='auto' style='font-style:normal'>%s</span>"
local flat = function ( a )
-- Is code point a within one of the RangesLatin intervals?
-- Returns true if so, nothing (nil) otherwise
local range
for i = 1, #RangesLatin do
range = RangesLatin[ i ]
if a >= range[ 1 ] and a <= range[ 2 ] then
return true
end
end -- for i
end -- flat()
local form = function ( a )
-- Result so far, plus the plain text k..j-1,
-- plus the run j..a wrapped in the span
return string.format( span,
r,
mw.ustring.sub( adjust, k, j - 1 ),
mw.ustring.sub( adjust, j, a ) )
end -- form()
r = ""
for i = 1, n do
c = mw.ustring.codepoint( adjust, i, i )
-- Only code points above 64 and the two listed below can start
-- or end a run; all others merely shift m (last branch)
if c > 64 or c == 38 or c == 60 then -- '&' '<'
if flat( c ) then
-- Latin character: closes an open run, if there is one
if j then
if m then
if i == m then
-- The run was a single greek letter: discard the run
j = false
end
m = false
end
if j then
-- Walk backwards from i-1 over spaces and "(" so that
-- they stay outside of the span; s collects them
local nx = i - 1
local s = ""
for ix = nx, 1, -1 do
c = mw.ustring.sub( adjust, ix, ix )
if c == " " or c == "(" then
nx = nx - 1
s = c .. s
else
break -- for ix
end
end -- for ix
r = form( nx ) .. s
j = false
k = i
end
end
elseif not j then
-- Not latin and no run open yet: a run starts here
j = i
if c >= 880 and c <= 1023 then
-- Code point 880..1023: possibly a single greek letter
m = i + 1
else
m = false
end
end
elseif m then
-- Other character while a single-letter candidate is pending:
-- move the expected position one step further
m = m + 1
end
end -- for i
if j and ( not m or m < n ) then
-- A run is still open at the end: wrap it up to the last character
r = form( n )
else
-- Append everything from k onwards unchanged
r = r .. mw.ustring.sub( adjust, k )
end
end
return r
end -- Text.uprightNonlatin()
-- Export
local p = { }

function p.concatParams( frame )
    -- Template interface of Text.concatParams()
    -- frame.args:
    --     template   -- "1" (after trimming): take the items from the
    --                   arguments of the parent frame instead
    --     separator  -- passed on as separator
    --     format     -- passed on as format
    local params = frame.args
    local viaTemplate = params.template
    if type( viaTemplate ) == "string" then
        viaTemplate = ( mw.text.trim( viaTemplate ) == "1" )
    end
    local items = params
    if viaTemplate then
        items = frame:getParent().args
    end
    return Text.concatParams( items, params.separator, params.format )
end
function p.containsCJK( frame )
    -- Template interface of Text.containsCJK()
    -- Returns: "1", if the first argument contains CJK; "" otherwise
    local found = Text.containsCJK( frame.args[ 1 ] or "" )
    if found then
        return "1"
    end
    return ""
end
function p.isLatinRange( frame )
    -- Template interface of Text.isLatinRange()
    -- Returns: "1", if the first argument is latin only; "" otherwise
    local text = frame.args[ 1 ] or ""
    if Text.isLatinRange( text ) then
        return "1"
    end
    return ""
end
function p.listToText( frame )
    -- Template interface of Text.listToText()
    -- frame.args:
    --     template  -- "1" (after trimming): take the items from the
    --                  arguments of the parent frame instead
    --     format    -- passed on as format
    local params = frame.args
    local viaTemplate = params.template
    if type( viaTemplate ) == "string" then
        viaTemplate = ( mw.text.trim( viaTemplate ) == "1" )
    end
    local items = params
    if viaTemplate then
        items = frame:getParent().args
    end
    return Text.listToText( items, params.format )
end
function p.removeDiacritics( frame )
    -- Template interface of Text.removeDiacritics()
    -- A missing first argument is treated as empty string
    local text = frame.args[ 1 ]
    if not text then
        text = ""
    end
    return Text.removeDiacritics( text )
end
function p.sentenceTerminated( frame )
    -- Template interface of Text.sentenceTerminated()
    -- Returns: "1", if the first argument is terminated; "" otherwise
    local text = frame.args[ 1 ] or ""
    if Text.sentenceTerminated( text ) then
        return "1"
    end
    return ""
end
function p.ucfirstAll( frame )
    -- Template interface of Text.ucfirstAll()
    -- A missing first argument is treated as empty string
    local text = frame.args[ 1 ]
    if not text then
        text = ""
    end
    return Text.ucfirstAll( text )
end
function p.uprightNonlatin( frame )
    -- Template interface of Text.uprightNonlatin()
    -- A missing first argument is treated as empty string
    local text = frame.args[ 1 ]
    if not text then
        text = ""
    end
    return Text.uprightNonlatin( text )
end
function p.zip(frame)
    -- Interleave the elements of several separated lists
    -- frame.args:
    --     1, 2, ...  -- the lists, each one a string
    --     sepN       -- separator of list N; handed to mw.text.split()
    --     sep        -- separator for lists without sepN (default: "")
    --     isep       -- inserted between the elements of one position
    --     osep       -- inserted between two positions
    -- Returns: string; for every position the elements of all lists
    --          at that position, an absent element counting as ""
    local args = frame.args
    local lists = {}
    local seps = {}
    local defaultsep = args["sep"] or ""
    local innersep = args["isep"] or ""
    local outersep = args["osep"] or ""
    -- Highest list number seen; the length operator is not reliable
    -- here since the numbering of the lists may have gaps
    local count = 0

    -- Parse the parameters
    for k, v in pairs(args) do
        local knum = tonumber(k)
        if knum then
            -- Only positive integers denote a list
            if knum >= 1 and knum == math.floor(knum) then
                lists[knum] = v
                if knum > count then count = knum end
            end
        elseif string.sub(k, 1, 3) == "sep" then
            local sepnum = tonumber(string.sub(k, 4))
            if sepnum then seps[sepnum] = v end
        end
    end

    -- Split the lists; without an explicit separator the default
    -- separator is used, and a list number that was left out
    -- contributes no elements
    local maxListLen = 0
    for i = 1, count do
        if lists[i] then
            lists[i] = mw.text.split(lists[i], seps[i] or defaultsep)
        else
            lists[i] = {}
        end
        if #lists[i] > maxListLen then maxListLen = #lists[i] end
    end

    -- Collect the pieces and join them once at the end
    local pieces = {}
    for i = 1, maxListLen do
        if i ~= 1 then pieces[#pieces + 1] = outersep end
        for j = 1, count do
            if j ~= 1 then pieces[#pieces + 1] = innersep end
            pieces[#pieces + 1] = lists[j][i] or ""
        end
    end
    return table.concat(pieces)
end
function p.Text()
    -- Hand out the library table for use by other Lua modules
    return Text
end -- p.Text

return p