Modul:Text: Unterschied zwischen den Versionen

Aus FreeWiki
Wechseln zu: Navigation, Suche
te>Mps
te>PerfektesChaos
(2015-03-17)
Zeile 1: Zeile 1:
--[=[ 2015-01-15
+
--[=[ 2015-03-17
 
Text utilities
 
Text utilities
 
]=]
 
]=]
Zeile 6: Zeile 6:
  
 
local Text = { }
 
local Text = { }
local patternCJK       = false
+
local PatternCJK       = false
local patternLatin     = false
+
local PatternCombined  = false
local patternTerminated = false
+
local PatternLatin     = false
 +
local PatternTerminated = false
 +
local RangesLatin      = false
  
  
Zeile 55: Zeile 57:
 
     return r
 
     return r
 
end -- Text.containsCJK()
 
end -- Text.containsCJK()
 +
 +
 +
 +
Text.isLatinRange = function ( adjust )
 +
    -- Are characters expected to be latin or symbols within latin texts?
 +
    -- Precondition:
 +
    --    adjust  -- string, or nil for initialization
 +
    -- Returns: true, if valid for latin only
 +
    local r
 +
    if not RangesLatin then
 +
        RangesLatin = { {    7,  687 },
 +
                        { 7531, 7578 },
 +
                        { 7680, 7935 },
 +
                        { 8194, 8250 } }
 +
    end
 +
    if not PatternLatin then
 +
        local range
 +
        PatternLatin = "^["
 +
        for i = 1, #RangesLatin do
 +
            range = RangesLatin[ i ]
 +
            PatternLatin = PatternLatin ..
 +
                          mw.ustring.char( range[ 1 ], 45, range[ 2 ] )
 +
        end    -- for i
 +
        PatternLatin = PatternLatin .. "]*$"
 +
mw.log(PatternLatin)
 +
    end
 +
    if adjust then
 +
        if mw.ustring.match( adjust, PatternLatin ) then
 +
            r = true
 +
        else
 +
            r = false
 +
        end
 +
    end
 +
    return r
 +
end -- Text.isLatinRange()
  
  
Zeile 78: Zeile 115:
 
     return mw.text.listToText( collect )
 
     return mw.text.listToText( collect )
 
end -- Text.listToText()
 
end -- Text.listToText()
 +
 +
 +
 +
Text.removeDiacritics = function ( adjust )
 +
    -- Remove all diacritics
 +
    -- Parameter:
 +
    --    adjust  -- string
 +
    -- Returns: string; all latin letters should be ASCII
 +
    --                  or basic greek or cyrillic or symbols etc.
 +
    local cleanup, decomposed
 +
    if not PatternCombined then
 +
        PatternCombined = mw.ustring.char( 91,
 +
                                            0x0300, 45, 0x036F,
 +
                                            0x1AB0, 45, 0x1AFF,
 +
                                            0x1DC0, 45, 0x1DFF,
 +
                                            0xFE20, 45, 0xFE2F,
 +
                                          93 )
 +
    end
 +
    decomposed = mw.ustring.toNFD( adjust )
 +
    cleanup    = mw.ustring.gsub( decomposed, PatternCombined, "" )
 +
    return mw.ustring.toNFC( cleanup )
 +
end -- Text.removeDiacritics()
  
  
Zeile 88: Zeile 147:
 
     -- Returns: true, if sentence terminated
 
     -- Returns: true, if sentence terminated
 
     local r
 
     local r
     if not patternTerminated then
+
     if not PatternTerminated then
         patternTerminated = mw.ustring.char( 91,
+
         PatternTerminated = mw.ustring.char( 91,
 
                                             12290,
 
                                             12290,
 
                                             65281,
 
                                             65281,
Zeile 96: Zeile 155:
 
                             .. "!%.%?…][\"'%]‹›«»‘’“”]*$"
 
                             .. "!%.%?…][\"'%]‹›«»‘’“”]*$"
 
     end
 
     end
     if mw.ustring.find( analyse, patternTerminated ) then
+
     if mw.ustring.find( analyse, PatternTerminated ) then
 
         r = true
 
         r = true
 
     else
 
     else
Zeile 163: Zeile 222:
 
     -- Returns: string with non-latin parts enclosed in <span>
 
     -- Returns: string with non-latin parts enclosed in <span>
 
     local r
 
     local r
     if not patternLatin then
+
     Text.isLatinRange()
        patternLatin = mw.ustring.char(   94, 91,
+
     if mw.ustring.match( adjust, PatternLatin ) then
                                          7, 45,  591,
 
                                        8194, 45, 8250,
 
                                          93, 42, 36 )
 
    end
 
     if mw.ustring.match( adjust, patternLatin ) then
 
 
         -- latin only, horizontal dashes, quotes
 
         -- latin only, horizontal dashes, quotes
 
         r = adjust
 
         r = adjust
Zeile 178: Zeile 232:
 
         local m    = false
 
         local m    = false
 
         local n    = mw.ustring.len( adjust )
 
         local n    = mw.ustring.len( adjust )
         local span = "%s%s<span style='font-style:normal'>%s</span>"
+
         local span = "%s%s<span dir='auto' style='font-style:normal'>%s</span>"
 
         local flat = function ( a )
 
         local flat = function ( a )
                -- isLatin
+
                  -- isLatin
                return  a <= 591  or  ( a >= 8194 and  a <= 8250 )
+
                  local range
 +
                  for i = 1, #RangesLatin do
 +
                      range = RangesLatin[ i ]
 +
                      if a >= range[ 1 ] and  a <= range[ 2 ] then
 +
                          return true
 +
                      end
 +
                  end    -- for i
 
               end -- flat()
 
               end -- flat()
 
         local form = function ( a )
 
         local form = function ( a )
Zeile 231: Zeile 291:
 
                 m = m + 1
 
                 m = m + 1
 
             end
 
             end
         end -- for i
+
         end   -- for i
 
         if j  and  ( not m  or  m < n ) then
 
         if j  and  ( not m  or  m < n ) then
 
             r = form( n )
 
             r = form( n )
Zeile 265: Zeile 325:
 
function p.containsCJK( frame )
 
function p.containsCJK( frame )
 
     return Text.containsCJK( frame.args[ 1 ] or "" ) and "1" or ""
 
     return Text.containsCJK( frame.args[ 1 ] or "" ) and "1" or ""
 +
end
 +
 +
function p.isLatinRange( frame )
 +
    return Text.isLatinRange( frame.args[1] or "" ) and "1" or ""
 
end
 
end
  
Zeile 280: Zeile 344:
 
     end
 
     end
 
     return Text.listToText( args, frame.args.format )
 
     return Text.listToText( args, frame.args.format )
 +
end
 +
 +
function p.removeDiacritics( frame )
 +
    return Text.removeDiacritics( frame.args[1] or "" )
 
end
 
end
  
Zeile 300: Zeile 368:
 
local innersep = frame.args["isep"] or ""
 
local innersep = frame.args["isep"] or ""
 
local outersep = frame.args["osep"] or ""
 
local outersep = frame.args["osep"] or ""
+
 
 
-- Parameter parsen
 
-- Parameter parsen
 
for k, v in pairs(frame.args) do
 
for k, v in pairs(frame.args) do
Zeile 332: Zeile 400:
 
end
 
end
 
return result
 
return result
end
 
 
-- removes all diacritics from the input string, by decomposing the characters, removing the combining diacritical marks and recomposing the remaining characters
 
function p.removeDiacritics(frame)
 
local combiningDiacriticalMarks = "[" ..
 
mw.ustring.char(0x0300) .. "-" .. mw.ustring.char(0x036F) ..
 
mw.ustring.char(0x1AB0) .. "-" .. mw.ustring.char(0x1AFF) ..
 
mw.ustring.char(0x1DC0) .. "-" .. mw.ustring.char(0x1DFF) ..
 
mw.ustring.char(0xFE20) .. "-" .. mw.ustring.char(0xFE2F) ..
 
"]"
 
return mw.ustring.toNFC(mw.ustring.gsub(mw.ustring.toNFD(frame.args[1] or ""), combiningDiacriticalMarks, ""))
 
 
end
 
end
  

Version vom 20. März 2015, 11:11 Uhr

Die Dokumentation für dieses Modul kann unter Modul:Text/Doku erstellt werden

--[=[ 2015-03-17
Text utilities
]=]



-- Library table; the Text.* functions below are attached to it
local Text = {}
-- File-level caches; each one holds false until a function below
-- stores a computed value in it
local PatternCJK, PatternCombined, PatternLatin = false, false, false
local PatternTerminated, RangesLatin            = false, false



Text.concatParams = function ( args, apply, adapt )
    -- Concat list items into one string
    -- Parameter:
    --     args   -- table (sequence) with numKey=string
    --               non-numeric keys are ignored
    --     apply  -- string (optional); separator (default: "|")
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string; trimmed non-empty items in ascending key order
    local collect = { }
    local order   = { }
    local v
    -- pairs() does not guarantee any traversal order;
    -- gather the numeric keys first and sort them,
    -- so that the items are always joined in their numeric sequence
    -- (gaps in the numbering are permitted)
    for k in pairs( args ) do
        if type( k ) == "number" then
            order[ #order + 1 ] = k
        end
    end    -- for k
    table.sort( order )
    for i = 1, #order do
        v = mw.text.trim( args[ order[ i ] ] )
        if v ~= "" then
            if adapt then
                v = mw.ustring.format( adapt, v )
            end
            collect[ #collect + 1 ] = v
        end
    end    -- for i
    return table.concat( collect,  apply or "|" )
end -- Text.concatParams()



Text.containsCJK = function ( analyse )
    -- Is any CJK code within?
    -- Parameter:
    --     analyse  -- string
    -- Returns: true, if CJK detected; false otherwise
    if not PatternCJK then
        -- Build the character class once and keep it in the file-level
        -- local PatternCJK (not in a global variable):
        --     "[" U+3400 "-" U+9FFF  U+20000 "-" U+2B81F "]"
        PatternCJK = mw.ustring.char( 91,
                                       13312, 45,  40959,
                                      131072, 45, 178207,
                                      93 )
    end
    if mw.ustring.find( analyse, PatternCJK ) then
        return true
    end
    return false
end -- Text.containsCJK()



Text.isLatinRange = function ( adjust )
    -- Are characters expected to be latin or symbols within latin texts?
    -- Precondition:
    --     adjust  -- string, or nil for initialization
    -- Returns: true, if valid for latin only (also for empty string)
    --          false, if any character is outside the latin ranges
    --          nil, if adjust is nil (RangesLatin and PatternLatin
    --               are initialized as a side effect)
    local r
    if not RangesLatin then
        -- Inclusive code point ranges regarded as latin:
        --     U+0007-U+02AF, U+1D6B-U+1D9A, U+1E00-U+1EFF, U+2002-U+203A
        RangesLatin = { {    7,  687 },
                        { 7531, 7578 },
                        { 7680, 7935 },
                        { 8194, 8250 } }
    end
    if not PatternLatin then
        -- Anchored pattern "^[a-bc-d...]*$" made of all ranges;
        -- 45 is the hyphen between lower and upper bound
        local collect = { "^[" }
        local range
        for i = 1, #RangesLatin do
            range = RangesLatin[ i ]
            collect[ #collect + 1 ] = mw.ustring.char( range[ 1 ],
                                                       45,
                                                       range[ 2 ] )
        end    -- for i
        collect[ #collect + 1 ] = "]*$"
        PatternLatin = table.concat( collect )
    end
    if adjust then
        if mw.ustring.match( adjust, PatternLatin ) then
            r = true
        else
            r = false
        end
    end
    return r
end -- Text.isLatinRange()



Text.listToText = function ( args, adapt )
    -- Format list items similar to mw.text.listToText()
    -- Parameter:
    --     args   -- table (sequence) with numKey=string
    --               non-numeric keys are ignored
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string; trimmed non-empty items in ascending key order,
    --          joined by mw.text.listToText()
    local collect = { }
    local order   = { }
    local v
    -- pairs() does not guarantee any traversal order;
    -- gather the numeric keys first and sort them,
    -- so that the items are always listed in their numeric sequence
    -- (gaps in the numbering are permitted)
    for k in pairs( args ) do
        if type( k ) == "number" then
            order[ #order + 1 ] = k
        end
    end    -- for k
    table.sort( order )
    for i = 1, #order do
        v = mw.text.trim( args[ order[ i ] ] )
        if v ~= "" then
            if adapt then
                v = mw.ustring.format( adapt, v )
            end
            collect[ #collect + 1 ] = v
        end
    end    -- for i
    return mw.text.listToText( collect )
end -- Text.listToText()



Text.removeDiacritics = function ( adjust )
    -- Strip combining marks from a string
    -- Parameter:
    --     adjust  -- string
    -- Returns: string; decomposed (NFD), combining marks of the
    --          ranges below removed, then recomposed (NFC)
    local stripped
    if not PatternCombined then
        -- Character class of four code point ranges, built once:
        --     U+0300-U+036F, U+1AB0-U+1AFF, U+1DC0-U+1DFF, U+FE20-U+FE2F
        PatternCombined = "["
                          .. mw.ustring.char( 0x0300, 45, 0x036F,
                                              0x1AB0, 45, 0x1AFF,
                                              0x1DC0, 45, 0x1DFF,
                                              0xFE20, 45, 0xFE2F )
                          .. "]"
    end
    -- Assignment keeps the string only and drops the match count
    stripped = mw.ustring.gsub( mw.ustring.toNFD( adjust ),
                                PatternCombined,
                                "" )
    return mw.ustring.toNFC( stripped )
end -- Text.removeDiacritics()



Text.sentenceTerminated = function ( analyse )
    -- Does the string end with a dot, question or exclamation mark?
    --     Closing quotation marks, apostrophes and "]" may follow
    -- Parameter:
    --     analyse  -- string
    -- Returns: true, if sentence terminated; false otherwise
    if not PatternTerminated then
        -- Terminators U+3002, U+FF01, U+FF0E, U+FF1F in addition to
        -- the ASCII marks and the ellipsis; built once and cached
        local wide = mw.ustring.char( 12290, 65281, 65294, 65311 )
        PatternTerminated = "[" .. wide .. "!%.%?…][\"'%]‹›«»‘’“”]*$"
    end
    if mw.ustring.find( analyse, PatternTerminated ) then
        return true
    end
    return false
end -- Text.sentenceTerminated()



Text.ucfirstAll = function ( adjust )
    -- Turn the first letter of every word into upper case
    -- Precondition:
    --     adjust  -- string
    -- Returns: string with all first letters in upper case
    --
    -- Named entities listed here are swapped for their numeric form
    -- while capitalizing, so their names are not changed,
    -- and swapped back afterwards
    local entities = { { "&amp;",    "&#38;"   },
                       { "&lt;",     "&#60;"   },
                       { "&gt;",     "&#62;"   },
                       { "&nbsp;",   "&#160;"  },
                       { "&thinsp;", "&#8201;" },
                       { "&zwnj;",   "&#8204;" },
                       { "&zwj;",    "&#8205;" },
                       { "&lrm;",    "&#8206;" },
                       { "&rlm;",    "&#8207;" } }
    -- Leading space lets the very first word match "%W%l", too
    local story  = " " .. adjust
    local masked = false
    local start, follow
    if adjust:find( "&" ) then
        for e = 1, #entities do
            story = story:gsub( entities[ e ][ 1 ], entities[ e ][ 2 ] )
        end    -- for e
        masked = true
    end
    start = mw.ustring.find( story, "%W%l", 1 )
    while start do
        -- start: non-word character; follow: lower case letter behind
        follow = start + 1
        story  = string.format( "%s%s%s",
                                mw.ustring.sub( story, 1, start ),
                                mw.ustring.upper( mw.ustring.sub( story,
                                                                  follow,
                                                                  follow ) ),
                                mw.ustring.sub( story, follow + 1 ) )
        start  = mw.ustring.find( story, "%W%l", follow )
    end    -- while start
    -- Drop the leading space again
    story = story:sub( 2 )
    if masked then
        for e = 1, #entities do
            story = story:gsub( entities[ e ][ 2 ], entities[ e ][ 1 ] )
        end    -- for e
        -- The "x" of hexadecimal entities has been capitalized above
        story = story:gsub( "&#X(%x+);", "&#x%1;" )
    end
    return story
end -- Text.ucfirstAll()



Text.uprightNonlatin = function ( adjust )
    -- Ensure non-italics for non-latin text parts
    --     One single greek letter might be granted
    -- Precondition:
    --     adjust  -- string
    -- Returns: string with non-latin parts enclosed in <span>
    --
    -- Characters are classified by code point:
    --     neutral    -- code <= 64, except "&" (38) and "<" (60);
    --                   they neither start nor end a non-latin run
    --     latin      -- within RangesLatin; ends an open non-latin run
    --     non-latin  -- anything else; starts a run if none is open
    local r
    -- Make sure that PatternLatin and RangesLatin are initialized
    Text.isLatinRange()
    if mw.ustring.match( adjust, PatternLatin ) then
        -- latin only, horizontal dashes, quotes
        r = adjust
    else
        local c
        -- j: index where the open non-latin run started, or false
        local j    = false
        -- k: index of first character not yet copied into r
        local k    = 1
        -- m: only if the open run started with a greek letter:
        --    j + 1 + number of neutral characters seen since;
        --    as long as this equals the current index, nothing but
        --    neutral characters followed the greek letter
        local m    = false
        local n    = mw.ustring.len( adjust )
        local span = "%s%s<span dir='auto' style='font-style:normal'>%s</span>"
        local flat = function ( a )
                  -- isLatin
                  -- Returns true if code point a is in any latin range,
                  --         nil otherwise
                  local range
                  for i = 1, #RangesLatin do
                      range = RangesLatin[ i ]
                      if a >= range[ 1 ]  and  a <= range[ 2 ] then
                          return true
                      end
                  end    -- for i
              end -- flat()
        local form = function ( a )
                -- Append to r: pending text k...(j-1) as it is,
                --              then the run j...a within <span>
                return string.format( span,
                                      r,
                                      mw.ustring.sub( adjust, k, j - 1 ),
                                      mw.ustring.sub( adjust, j, a ) )
              end -- form()
        r = ""
        for i = 1, n do
            c = mw.ustring.codepoint( adjust, i, i )
            if c > 64  or  c == 38  or  c == 60 then    -- '&' '<'
                if flat( c ) then
                    if j then
                        if m then
                            if i == m then
                                -- single greek letter.
                                -- discard the run; the letter remains
                                -- part of the pending text from k
                                j = false
                            end
                            m = false
                        end
                        if j then
                            -- Close the run in front of this latin
                            -- character; trailing spaces and "(" are
                            -- kept outside of the <span>
                            local nx = i - 1
                            local s  = ""
                            for ix = nx, 1, -1 do
                                c = mw.ustring.sub( adjust, ix, ix )
                                if c == " "  or  c == "(" then
                                    nx = nx - 1
                                    s  = c .. s
                                else
                                    break -- for ix
                                end
                            end -- for ix
                            r = form( nx ) .. s
                            j = false
                            k = i
                        end
                    end
                elseif not j then
                    j = i
                    if c >= 880  and  c <= 1023 then
                        -- single greek letter?
                        m = i + 1
                    else
                        m = false
                    end
                end
            elseif m then
                -- neutral character behind greek letter
                m = m + 1
            end
        end    -- for i
        -- A run is still open at the end of the string.
        -- m == n + 1 means that only neutral characters followed the
        -- greek letter, which is then granted as single letter;
        -- m <= n means that at least one more non-latin character
        -- followed, even if it is the very last one (e.g. two greek
        -- letters at the end), and the run needs to be enclosed
        if j  and  ( not m  or  m <= n ) then
            r = form( n )
        else
            r = r .. mw.ustring.sub( adjust, k )
        end
    end
    return r
end -- Text.uprightNonlatin()



-- Export table; the functions below are attached to it
local p = {}

function p.concatParams( frame )
    -- Wrapper for Text.concatParams()
    -- Parameter (frame.args):
    --     template   -- "1": take the items from the parent frame
    --                         rather than from this invocation
    --     separator  -- passed on as separator
    --     format     -- passed on as format
    -- Returns: string
    local params = frame.args
    local source = params
    local parent = params.template
    if type( parent ) == "string" then
        parent = ( mw.text.trim( parent ) == "1" )
    end
    if parent then
        source = frame:getParent().args
    end
    return Text.concatParams( source, params.separator, params.format )
end

function p.containsCJK( frame )
    -- Wrapper for Text.containsCJK() on the first unnamed parameter
    -- Returns: "1" if positive, else empty string
    local story = frame.args[ 1 ] or ""
    if Text.containsCJK( story ) then
        return "1"
    end
    return ""
end

function p.isLatinRange( frame )
    -- Wrapper for Text.isLatinRange() on the first unnamed parameter
    -- Returns: "1" if positive, else empty string
    local story = frame.args[ 1 ] or ""
    if Text.isLatinRange( story ) then
        return "1"
    end
    return ""
end

function p.listToText( frame )
    -- Wrapper for Text.listToText()
    -- Parameter (frame.args):
    --     template  -- "1": take the items from the parent frame
    --                        rather than from this invocation
    --     format    -- passed on as format
    -- Returns: string
    local params = frame.args
    local source = params
    local parent = params.template
    if type( parent ) == "string" then
        parent = ( mw.text.trim( parent ) == "1" )
    end
    if parent then
        source = frame:getParent().args
    end
    return Text.listToText( source, params.format )
end

function p.removeDiacritics( frame )
    -- Wrapper for Text.removeDiacritics() on the first unnamed parameter
    -- Returns: string
    local story = frame.args[ 1 ] or ""
    return Text.removeDiacritics( story )
end

function p.sentenceTerminated( frame )
    -- Wrapper for Text.sentenceTerminated() on the first unnamed parameter
    -- Returns: "1" if positive, else empty string
    local story = frame.args[ 1 ] or ""
    if Text.sentenceTerminated( story ) then
        return "1"
    end
    return ""
end

function p.ucfirstAll( frame )
    -- Wrapper for Text.ucfirstAll() on the first unnamed parameter
    -- Returns: string
    local story = frame.args[ 1 ] or ""
    return Text.ucfirstAll( story )
end

function p.uprightNonlatin( frame )
    -- Wrapper for Text.uprightNonlatin() on the first unnamed parameter
    -- Returns: string
    local story = frame.args[ 1 ] or ""
    return Text.uprightNonlatin( story )
end

function p.zip(frame)
	-- Split several lists and interleave their items
	-- Parameter (frame.args):
	--     1, 2, 3 ...  -- the lists, each one a string;
	--                     lists behind a gap in the numbering are ignored
	--     sep          -- default separator for splitting (default: "")
	--     sep1, sep2 ...  -- separator for splitting list 1, 2 ...
	--     isep         -- inserted between items of the same position
	--     osep         -- inserted between consecutive positions
	-- Returns: string; for each position i the i-th items of all lists
	--          joined by isep, and those groups joined by osep;
	--          lists which are too short contribute empty strings
	local args = frame.args
	local lists = {}
	local seps = {}
	local defaultsep = args["sep"] or ""
	local innersep = args["isep"] or ""
	local outersep = args["osep"] or ""

	-- parse parameters
	for k, v in pairs(args) do
		local knum = tonumber(k)
		if knum then
			lists[knum] = v
		elseif string.sub(k, 1, 3) == "sep" then
			local sepnum = tonumber(string.sub(k, 4))
			if sepnum then seps[sepnum] = v end
		end
	end

	-- Count the lists numbered consecutively from 1. The length operator
	-- is not used, since its result is not defined for a table with gaps
	-- and a missing list would be handed over to the splitting.
	local count = 0
	while lists[count + 1] ~= nil do
		count = count + 1
	end

	-- split lists; use the default separator unless an explicit one is given
	-- NOTE: no plain flag is passed to mw.text.split(), therefore the
	-- separator is interpreted as a pattern
	local maxListLen = 0
	for i = 1, count do
		lists[i] = mw.text.split(lists[i], seps[i] or defaultsep)
		if #lists[i] > maxListLen then maxListLen = #lists[i] end
	end

	-- assemble by table.concat rather than repeated string concatenation
	local rows = {}
	for i = 1, maxListLen do
		local cells = {}
		for j = 1, count do
			cells[j] = lists[j][i] or ""
		end
		rows[i] = table.concat(cells, innersep)
	end
	return table.concat(rows, outersep)
end

-- Hands out the internal Text table carrying the library functions
function p.Text()
    return Text
end -- p.Text

return p