Module:Text: Difference between revisions
From The Goon Show Depository
m (1 revision imported) |
en>Hike395 (factor data out into Module:Text/data, loaded via mw.loadData) |
||
Line 5: | Line 5: | ||
Text utilities | Text utilities | ||
]=] | ]=] | ||
local function fiatQuote( apply, alien, advance ) | local function fiatQuote( apply, alien, advance ) | ||
Line 127: | Line 16: | ||
advance = tonumber(advance) or 0 | advance = tonumber(advance) or 0 | ||
local suite | local suite | ||
local data = mw.loadData('Module:Text/data') | |||
local QuoteLang = data.QuoteLang | |||
local QuoteType = data.QuoteType | |||
local slang = alien:match( "^(%l+)-" ) | local slang = alien:match( "^(%l+)-" ) | ||
suite = QuoteLang[alien] or slang and QuoteLang[slang] or QuoteLang["en"] | suite = QuoteLang[alien] or slang and QuoteLang[slang] or QuoteLang["en"] | ||
Line 230: | Line 121: | ||
-- Returns: true, if CJK detected | -- Returns: true, if CJK detected | ||
s = s and tostring(s) or "" | s = s and tostring(s) or "" | ||
local patternCJK = mw.loadData('Module:Text/data').PatternCJK | |||
return mw.ustring.find( s, patternCJK ) ~= nil | return mw.ustring.find( s, patternCJK ) ~= nil | ||
end -- Text.containsCJK() | end -- Text.containsCJK() | ||
Line 294: | Line 174: | ||
-- Returns: true, if valid for latin only | -- Returns: true, if valid for latin only | ||
s = s and tostring(s) or "" --- ensure input is always string | s = s and tostring(s) or "" --- ensure input is always string | ||
local PatternLatin = mw.loadData('Module:Text/data').PatternLatin | |||
return mw.ustring.match(s, PatternLatin) ~= nil | return mw.ustring.match(s, PatternLatin) ~= nil | ||
end -- Text.isLatinRange() | end -- Text.isLatinRange() | ||
Line 309: | Line 189: | ||
return false | return false | ||
end | end | ||
local SeekQuote = mw.loadData('Module:Text/data').SeekQuote | |||
return mw.ustring.find( SeekQuote, s, 1, true ) ~= nil | return mw.ustring.find( SeekQuote, s, 1, true ) ~= nil | ||
end -- Text.isQuote() | end -- Text.isQuote() | ||
Line 398: | Line 261: | ||
-- or basic greek or cyrillic or symbols etc. | -- or basic greek or cyrillic or symbols etc. | ||
local cleanup, decomposed | local cleanup, decomposed | ||
local PatternCombined = mw.loadData('Module:Text/data').PatternCombined | |||
decomposed = mw.ustring.toNFD( adjust and tostring(adjust) or "" ) | decomposed = mw.ustring.toNFD( adjust and tostring(adjust) or "" ) | ||
cleanup = mw.ustring.gsub( decomposed, PatternCombined, "" ) | cleanup = mw.ustring.gsub( decomposed, PatternCombined, "" ) | ||
Line 420: | Line 276: | ||
-- Returns: true, if sentence terminated | -- Returns: true, if sentence terminated | ||
local r | local r | ||
local PatternTerminated = mw.loadData('Module:Text/data').PatternTerminated | |||
if mw.ustring.find( analyse, PatternTerminated ) then | if mw.ustring.find( analyse, PatternTerminated ) then | ||
r = true | r = true | ||
Line 476: | Line 325: | ||
-- Returns: string with non-latin parts enclosed in <span> | -- Returns: string with non-latin parts enclosed in <span> | ||
local r | local r | ||
local data = mw.loadData('Module:Text/data') | |||
local PatternLatin = data.PatternLatin | |||
local RangesLatin = data.RangesLatin | |||
local NumLatinRanges = data.NumLatinRanges | |||
if mw.ustring.match( adjust, PatternLatin ) then | if mw.ustring.match( adjust, PatternLatin ) then | ||
-- latin only, horizontal dashes, quotes | -- latin only, horizontal dashes, quotes | ||
Line 490: | Line 342: | ||
-- isLatin | -- isLatin | ||
local range | local range | ||
for i = 1, | -- NumLatinRanges has to be precomputed because # does not work from loadData | ||
for i = 1, NumLatinRanges do | |||
range = RangesLatin[ i ] | range = RangesLatin[ i ] | ||
if a >= range[ 1 ] and a <= range[ 2 ] then | if a >= range[ 1 ] and a <= range[ 2 ] then | ||
Line 569: | Line 422: | ||
local r | local r | ||
if about == "quote" then | if about == "quote" then | ||
data = mw.loadData('Module:Text/data') | |||
r = { } | r = { } | ||
r.QuoteLang = QuoteLang | r.QuoteLang = data.QuoteLang | ||
r.QuoteType = QuoteType | r.QuoteType = data.QuoteType | ||
end | end | ||
return r | return r | ||
end -- Text.test() | end -- Text.test() | ||
-- Non Unicode-aware version of mw.text.split and mw.text.gsplit | |||
-- based on [[phab:diffusion/ELUA/browse/master/includes/Engines/LuaCommon/lualib/mw.text.lua]] | |||
-- These run up to 60 times faster than the Unicode-aware versions | |||
Text.split = function ( text, pattern, plain ) | |||
local ret = {} | |||
for m in Text.gsplit( text, pattern, plain ) do | |||
ret[#ret+1] = m | |||
end | |||
return ret | |||
end | |||
Text.gsplit = function ( text, pattern, plain ) | |||
local s, l = 1, string.len( text ) | |||
return function () | |||
if s then | |||
local e, n = string.find( text, pattern, s, plain ) | |||
local ret | |||
if not e then | |||
ret = string.sub( text, s ) | |||
s = nil | |||
elseif n < e then | |||
-- Empty separator! | |||
ret = string.sub( text, s, e ) | |||
if e < l then | |||
s = e + 1 | |||
else | |||
s = nil | |||
end | |||
else | |||
ret = e > s and string.sub( text, s, e - 1 ) or '' | |||
s = n + 1 | |||
end | |||
return ret | |||
end | |||
end, nil, nil | |||
end | |||
-- Export | -- Export | ||
Line 756: | Line 644: | ||
end | end | ||
function p.split(frame) | |||
local text = frame.args.text or frame.args[1] or '' | |||
local pattern = frame.args.pattern or frame.args[2] or '' | |||
local plain = yesNo(frame.args.plain or frame.args[3]) | |||
local index = tonumber(frame.args.index) or tonumber(frame.args[4]) or 1 | |||
local a = Text.split(text, pattern, plain) | |||
if index < 0 then index = #a + index + 1 end | |||
return a[index] | |||
end | |||
Line 761: | Line 659: | ||
return Text.serial | return Text.serial | ||
end | end | ||
Revision as of 08:01, 21 September 2024
local yesNo = require("Module:Yesno")
-- Table that collects the library functions defined below; it is handed out
-- to other modules through p.Text().
-- serial is the version stamp returned by p.failsafe().
-- suite looks like the module name; it is not read anywhere in this file.
local Text = { serial = "2022-07-21",
suite = "Text" }
--[=[
Text utilities
]=]
local function fiatQuote( apply, alien, advance )
    -- Quote text
    -- Parameter:
    --     apply    -- string, with text; nil / false give "", other
    --                 values are converted with tostring()
    --     alien    -- string, with language code (default: "en")
    --     advance  -- number, with level 1 or 2
    -- Returns: string; the text wrapped in quotation marks, or the plain
    --          text when no marks are configured for the language / level
    local r = apply and tostring(apply) or ""
    alien = alien and tostring(alien) or "en"
    advance = tonumber(advance) or 0
    local data = mw.loadData('Module:Text/data')
    local QuoteLang = data.QuoteLang
    local QuoteType = data.QuoteType
    -- Part before the first hyphen (e.g. "de" of "de-ch"); fallback key
    local slang = alien:match( "^(%l+)-" )
    local suite = QuoteLang[alien] or slang and QuoteLang[slang] or QuoteLang["en"]
    if suite then
        local quotes = QuoteType[ suite ]
        if quotes then
            -- A truthy third entry requests a space between mark and text
            local space = quotes[ 3 ] and " " or ""
            quotes = quotes[ advance ]
            if quotes then
                -- Format the normalised text r, not the raw argument:
                -- "%s" raises on nil, booleans and tables.
                r = mw.ustring.format( "%s%s%s%s%s",
                                       mw.ustring.char( quotes[ 1 ] ),
                                       space,
                                       r,
                                       space,
                                       mw.ustring.char( quotes[ 2 ] ) )
            end
        else
            mw.log( "fiatQuote() " .. suite )
        end
    end
    return r
end -- fiatQuote()
Text.char = function ( apply, again, accept )
    -- Create string from codepoints
    -- Parameter:
    --     apply   -- table (sequence) with numerical codepoints, or nil
    --     again   -- number of repetitions, or nil (default: 1)
    --     accept  -- true, if no error messages to be appended
    -- Returns: string; "" if nothing to build or again < 1;
    --          an error <span> (or "" when accept is set) if any item
    --          is not an acceptable codepoint
    local r = ""
    apply = type(apply) == "table" and apply or {}
    again = math.floor(tonumber(again) or 1)
    if again < 1 then
        return ""
    end
    local bad = { }
    local codes = { }
    for _, v in ipairs( apply ) do
        local n = tonumber(v)
        -- Rejected: non-numbers, NaN, control characters other than
        -- TAB / LF, and values beyond the last Unicode codepoint, which
        -- would make mw.ustring.char() raise an error.
        if not n
           or n ~= n
           or (n < 32 and n ~= 9 and n ~= 10)
           or n > 0x10FFFF then
            table.insert(bad, tostring(v))
        else
            table.insert(codes, math.floor(n))
        end
    end
    if #bad > 0 then
        if not accept then
            r = tostring( mw.html.create( "span" )
                          :addClass( "error" )
                          :wikitext( "bad codepoints: " .. table.concat( bad, " " )) )
        end
        return r
    end
    if #codes > 0 then
        r = mw.ustring.char( unpack( codes ) )
        if again > 1 then
            r = r:rep(again)
        end
    end
    return r
end -- Text.char()
local function trimAndFormat(args, fmt)
    -- Trim every item of a sequence, drop items that are empty after
    -- trimming and optionally run each remaining item through a format.
    -- Parameter:
    --     args  -- table (sequence), or a single value treated as one item
    --     fmt   -- string (optional); format including "%s"
    -- Returns: table (sequence) of strings
    local items = args
    if type(items) ~= 'table' then
        items = { items }
    end
    local kept = {}
    for _, raw in ipairs(items) do
        local trimmed = mw.text.trim(tostring(raw))
        if trimmed ~= "" then
            if fmt then
                kept[#kept + 1] = mw.ustring.format(fmt, trimmed)
            else
                kept[#kept + 1] = trimmed
            end
        end
    end
    return kept
end
Text.concatParams = function ( args, apply, adapt )
    -- Concat list items into one string
    -- Items are trimmed; items that are empty after trimming are skipped.
    -- Parameter:
    --     args   -- table (sequence) with numKey=string
    --     apply  -- string (optional); separator (default: "|")
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string
    return table.concat( trimAndFormat( args, adapt ), apply or "|" )
end -- Text.concatParams()
Text.containsCJK = function ( s )
    -- Does the text contain anything matched by the CJK pattern?
    -- Parameter:
    --     s  -- string (nil / false are treated as "")
    -- Returns: true, if CJK detected
    local text = ""
    if s then
        text = tostring(s)
    end
    local data = mw.loadData('Module:Text/data')
    local hit = mw.ustring.find( text, data.PatternCJK )
    return hit ~= nil
end -- Text.containsCJK()
Text.removeDelimited = function (s, prefix, suffix)
    -- Remove all text in s delimited by prefix and suffix (inclusive)
    -- Both delimiters are taken literally (no Lua pattern matching).
    -- A prefix without a following suffix removes everything up to the
    -- end of the string.
    -- Arguments:
    --    s = string to process
    --    prefix = initial delimiter
    --    suffix = ending delimiter
    -- Returns: stripped string; s unchanged if a delimiter is empty
    s = s and tostring(s) or ""
    prefix = prefix and tostring(prefix) or ""
    suffix = suffix and tostring(suffix) or ""
    -- Byte lengths: the searching and slicing below use the byte-based
    -- string library, so character counts would be wrong for
    -- delimiters containing non-ASCII characters.
    local prefixLen = #prefix
    local suffixLen = #suffix
    if prefixLen == 0 or suffixLen == 0 then
        return s
    end
    local r = s
    local i = r:find(prefix, 1, true)
    while i do
        -- Plain search: without it "-->" is read as a pattern
        -- ("-" is a quantifier) and matches any ">".
        local j = r:find(suffix, i + prefixLen, true)
        if j then
            r = r:sub(1, i - 1) .. r:sub(j + suffixLen)
        else
            r = r:sub(1, i - 1)
        end
        i = r:find(prefix, 1, true)
    end
    return r
end
Text.getPlain = function ( adjust )
    -- Remove wikisyntax from string, except templates
    -- Removed: HTML comments, HTML-like tags, bold / italic apostrophes.
    -- Non-breaking spaces become ordinary spaces.
    -- Parameter:
    --     adjust  -- string
    -- Returns: string
    local r = Text.removeDelimited(adjust,"<!--","-->")
    -- The assignment keeps only the first result of the gsub chain
    -- (the string) and drops the replacement count.
    r = r:gsub( "(</?%l[^>]*>)", "" )
         :gsub( "'''", "" )
         :gsub( "''", "" )
         -- both the entity and the literal U+00A0 (UTF-8: C2 A0)
         :gsub( "&nbsp;", " " )
         :gsub( "\194\160", " " )
    return r
end -- Text.getPlain()
Text.isLatinRange = function (s)
    -- Are characters expected to be latin or symbols within latin texts?
    -- Arguments:
    --    s = string to analyze (nil / false are treated as "")
    -- Returns: true, if the latin pattern from Module:Text/data matches
    local text = ""
    if s then
        text = tostring(s)
    end
    local data = mw.loadData('Module:Text/data')
    local matched = mw.ustring.match(text, data.PatternLatin)
    return matched ~= nil
end -- Text.isLatinRange()
Text.isQuote = function ( s )
    -- Is this character any quotation mark?
    -- The check is a plain substring search inside the SeekQuote string
    -- from Module:Text/data.
    -- Parameter:
    --     s = single character to analyze
    -- Returns: true, if s is quotation mark; false for empty input
    local candidate = s and tostring(s) or ""
    if candidate ~= "" then
        local SeekQuote = mw.loadData('Module:Text/data').SeekQuote
        return mw.ustring.find( SeekQuote, candidate, 1, true ) ~= nil
    end
    return false
end -- Text.isQuote()
Text.listToText = function ( args, adapt )
    -- Format list items similar to mw.text.listToText()
    -- Items are trimmed, empty ones dropped, before being joined.
    -- Parameter:
    --     args   -- table (sequence) with numKey=string
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string
    local items = trimAndFormat( args, adapt )
    return mw.text.listToText( items )
end -- Text.listToText()
Text.quote = function ( apply, alien, advance )
    -- Quote text
    -- Parameter:
    --     apply    -- string, with text
    --     alien    -- string, with language code, or nil
    --     advance  -- number, with level 1 or 2, or nil
    -- Returns: quoted string
    local text = apply and tostring(apply) or ""
    local lang
    if type( alien ) == "string" then
        lang = mw.text.trim( alien ):lower()
    else
        -- No language given: page language if available,
        -- otherwise the content language of the wiki.
        -- TODO FIXME: Introduction expected 2017-04
        lang = mw.title.getCurrentTitle().pageLanguage
               or mw.language.getContentLanguage():getCode()
    end
    -- Anything but exactly 2 selects level 1
    local level = ( advance == 2 ) and 2 or 1
    return fiatQuote( mw.text.trim( text ), lang, level )
end -- Text.quote()
Text.quoteUnquoted = function ( apply, alien, advance )
    -- Quote text, if not yet quoted and not empty
    -- The text counts as already quoted when its first or its last
    -- character is a quotation mark.
    -- Parameter:
    --     apply    -- string, with text
    --     alien    -- string, with language code, or nil
    --     advance  -- number, with level 1 or 2, or nil
    -- Returns: string; possibly quoted
    local r = mw.text.trim( apply and tostring(apply) or "" )
    if r ~= "" then
        local first = mw.ustring.sub( r, 1, 1 )
        -- sub( r, -1 ) is the last character; sub( r, -1, 1 ) would be
        -- empty for every string longer than one character.
        local last = mw.ustring.sub( r, -1 )
        if not Text.isQuote( first ) and not Text.isQuote( last ) then
            r = Text.quote( r, alien, advance )
        end
    end
    return r
end -- Text.quoteUnquoted()
Text.removeDiacritics = function ( adjust )
    -- Remove all diacritics
    -- The text is decomposed (NFD), everything matching PatternCombined
    -- from Module:Text/data is deleted, then it is recomposed (NFC).
    -- Parameter:
    --     adjust  -- string
    -- Returns: string; all latin letters should be ASCII
    --                  or basic greek or cyrillic or symbols etc.
    local source = adjust and tostring(adjust) or ""
    local PatternCombined = mw.loadData('Module:Text/data').PatternCombined
    -- Only the first gsub result (the string) is kept
    local stripped = mw.ustring.gsub( mw.ustring.toNFD( source ),
                                      PatternCombined,
                                      "" )
    return mw.ustring.toNFC( stripped )
end -- Text.removeDiacritics()
Text.sentenceTerminated = function ( analyse )
    -- Is string terminated by dot, question or exclamation mark?
    --     Quotation, link termination and so on granted
    -- Parameter:
    --     analyse  -- string (nil / false are treated as "")
    -- Returns: true, if sentence terminated
    -- Normalise the input like the sibling predicates do, so a missing
    -- argument yields a result instead of a Lua error.
    analyse = analyse and tostring(analyse) or ""
    local PatternTerminated = mw.loadData('Module:Text/data').PatternTerminated
    return mw.ustring.find( analyse, PatternTerminated ) ~= nil
end -- Text.sentenceTerminated()
Text.ucfirstAll = function ( adjust)
    -- Capitalize all words
    -- Arguments:
    --     adjust = string to adjust
    -- Returns: string with all first letters in upper case
    local source = adjust and tostring(adjust) or ""
    local decoded = mw.text.decode( source, true )
    -- Re-encode at the end only if decoding changed something
    local hadEntities = ( decoded ~= source )
    -- Leading space lets the very first letter match "%W%l" as well
    local work = " " .. decoded
    local pos = mw.ustring.find( work, "%W%l", 1 )
    while pos do
        local letterAt = pos + 1
        local upper = mw.ustring.upper( mw.ustring.sub( work, letterAt, letterAt ) )
        work = string.format( "%s%s%s",
                              mw.ustring.sub( work, 1, pos ),
                              upper,
                              mw.ustring.sub( work, letterAt + 1 ) )
        -- Continue searching at the letter just replaced
        pos = mw.ustring.find( work, "%W%l", letterAt )
    end
    -- Drop the helper space again
    work = work:sub( 2 )
    if hadEntities then
        work = mw.text.encode( work )
    end
    return work
end -- Text.ucfirstAll()
Text.uprightNonlatin = function ( adjust )
-- Ensure non-italics for non-latin text parts
-- One single greek letter might be granted
-- Precondition:
-- adjust -- string
-- Returns: string with non-latin parts enclosed in <span>
--
-- The text is scanned character by character. Runs that start at a
-- character outside the latin ranges are wrapped in
-- <span dir='auto' style='font-style:normal'>; everything else is copied.
-- NOTE(review): adjust is used without normalisation, so nil presumably
-- raises inside mw.ustring.match -- confirm callers always pass a string.
local r
local data = mw.loadData('Module:Text/data')
local PatternLatin = data.PatternLatin
local RangesLatin = data.RangesLatin
local NumLatinRanges = data.NumLatinRanges
-- Shortcut: if the pattern matches, the text is returned untouched.
-- (PatternLatin lives in Module:Text/data; presumably it is anchored to
-- match only entirely latin strings -- verify there.)
if mw.ustring.match( adjust, PatternLatin ) then
-- latin only, horizontal dashes, quotes
r = adjust
else
-- Scanner state:
-- c -- codepoint of the current character (reused for a one-character
-- string inside the backtracking loop below)
-- j -- index where the current non-latin run starts, or false
-- k -- index of the first character not yet copied into r
-- m -- false, or the index at which a latin character means the run
-- was just one Greek letter
-- n -- length of adjust in characters
local c
local j = false
local k = 1
local m = false
local n = mw.ustring.len( adjust )
-- Format slots: result so far, pending latin text, non-latin run
local span = "%s%s<span dir='auto' style='font-style:normal'>%s</span>"
-- flat( a ): true if codepoint a lies within one of the RangesLatin
-- intervals; falls off the end (returns nothing) otherwise.
local flat = function ( a )
-- isLatin
local range
-- NumLatinRanges has to be precomputed because # does not work from loadData
for i = 1, NumLatinRanges do
range = RangesLatin[ i ]
if a >= range[ 1 ] and a <= range[ 2 ] then
return true
end
end -- for i
end -- flat()
-- focus( a ): true if codepoint a decides about latin / non-latin.
-- Codepoints up to 64 (except '&' and '<') and 8192..8212
-- (U+2000..U+2014) are treated as ambivalent and neither start nor end
-- a run.
local focus = function ( a )
-- char is not ambivalent
local r = ( a > 64 )
if r then
r = ( a < 8192 or a > 8212 )
else
r = ( a == 38 or a == 60 ) -- '&' '<'
end
return r
end -- focus()
-- form( a ): result so far + pending text k..j-1 + run j..a in a span.
-- Reads the outer r, j and k at call time.
local form = function ( a )
return string.format( span,
r,
mw.ustring.sub( adjust, k, j - 1 ),
mw.ustring.sub( adjust, j, a ) )
end -- form()
r = ""
for i = 1, n do
c = mw.ustring.codepoint( adjust, i, i )
if focus( c ) then
if flat( c ) then
-- Latin character: closes an open non-latin run, if any
if j then
if m then
if i == m then
-- single greek letter.
j = false
end
m = false
end
if j then
-- Step back over spaces and "(" directly before the latin
-- character; they are collected in s and stay outside the span.
local nx = i - 1
local s = ""
for ix = nx, 1, -1 do
c = mw.ustring.sub( adjust, ix, ix )
if c == " " or c == "(" then
nx = nx - 1
s = c .. s
else
break -- for ix
end
end -- for ix
r = form( nx ) .. s
j = false
k = i
end
end
elseif not j then
-- First non-latin character: a run starts here
j = i
-- 880..1023 is U+0370..U+03FF
if c >= 880 and c <= 1023 then
-- single greek letter?
m = i + 1
else
m = false
end
end
elseif m then
-- Ambivalent character while a single-Greek-letter check is pending:
-- move the expected index of the next latin character along
m = m + 1
end
end -- for i
-- End of text: wrap a run that is still open (unless m is set and not
-- below n), otherwise append the remaining text from k unchanged.
if j and ( not m or m < n ) then
r = form( n )
else
r = r .. mw.ustring.sub( adjust, k )
end
end
return r
end -- Text.uprightNonlatin()
Text.test = function ( about )
    -- Expose internal data for testing
    -- Parameter:
    --     about  -- string; only "quote" is supported
    -- Returns: table with QuoteLang and QuoteType from Module:Text/data,
    --          or nil for any other request
    local r
    if about == "quote" then
        -- local: a bare assignment would create a global variable
        local data = mw.loadData('Module:Text/data')
        r = { }
        r.QuoteLang = data.QuoteLang
        r.QuoteType = data.QuoteType
    end
    return r
end -- Text.test()
-- Non Unicode-aware version of mw.text.split and mw.text.gsplit
-- based on [[phab:diffusion/ELUA/browse/master/includes/Engines/LuaCommon/lualib/mw.text.lua]]
-- These run up to 60 times faster than the Unicode-aware versions
Text.split = function ( text, pattern, plain )
    -- Split text at every match of pattern (byte-based string library)
    -- Parameter:
    --     text     -- string to split
    --     pattern  -- string; separator (Lua pattern unless plain)
    --     plain    -- true, if pattern is to be taken literally
    -- Returns: table (sequence) with the pieces from Text.gsplit()
    local pieces = {}
    local count = 0
    for piece in Text.gsplit( text, pattern, plain ) do
        count = count + 1
        pieces[count] = piece
    end
    return pieces
end
Text.gsplit = function ( text, pattern, plain )
    -- Iterator variant of Text.split() (byte-based string library)
    -- Parameter:
    --     text     -- string to split
    --     pattern  -- string; separator (Lua pattern unless plain)
    --     plain    -- true, if pattern is to be taken literally
    -- Returns: iterator function yielding one piece per call
    local from = 1                      -- next search position; nil when done
    local total = string.len( text )
    local function step()
        if not from then
            return
        end
        local first, last = string.find( text, pattern, from, plain )
        local piece
        if first == nil then
            -- No further separator: the rest is the final piece
            piece = string.sub( text, from )
            from = nil
        elseif last >= first then
            -- Regular separator: piece is the text in front of it
            if first > from then
                piece = string.sub( text, from, first - 1 )
            else
                piece = ''
            end
            from = last + 1
        else
            -- Empty separator! Advance by one position
            piece = string.sub( text, from, first )
            if first < total then
                from = first + 1
            else
                from = nil
            end
        end
        return piece
    end
    return step, nil, nil
end
-- Export
local p = { }

-- Predicates: the template interface returns "1" for true, "" for false.
for _, name in ipairs({'containsCJK','isLatinRange','isQuote','sentenceTerminated'}) do
    p[name] = function (frame)
        local input = frame.args[ 1 ] or ""
        if Text[name]( input ) then
            return "1"
        end
        return ""
    end
end

-- String transformations: the first unnamed parameter goes straight to the
-- library function of the same name and its result is returned.
for _, name in ipairs({'getPlain','removeDiacritics','ucfirstAll','uprightNonlatin'}) do
    p[name] = function (frame)
        local input = frame.args[ 1 ] or ""
        return Text[name]( input )
    end
end
function p.char( frame )
    -- Template interface for Text.char()
    -- Parameters are read from the parent (template) frame; if that has
    -- no first unnamed parameter, from the #invoke frame itself.
    --     1       -- codepoints, separated by whitespace; a leading "x"
    --                is read as hexadecimal ("x41" -> 0x41)
    --     *       -- number of repetitions
    --     errors  -- a value yesNo() evaluates to false suppresses the
    --                error message
    local params = frame:getParent().args
    local story = params[ 1 ]
    if not story then
        params = frame.args
        story = params[ 1 ]
    end
    local codes, lenient, multiple
    if story then
        local tokens = mw.text.split( mw.text.trim(story), "%s+" )
        if #tokens > 0 then
            lenient = (yesNo(params.errors) == false)
            multiple = tonumber( params[ "*" ] )
            codes = { }
            for _, token in ipairs( tokens ) do
                local literal = token
                if token:sub( 1, 1 ) == "x" then
                    literal = "0" .. token
                end
                -- Unparsable tokens are passed on as strings so that
                -- Text.char() can report them
                table.insert( codes, tonumber( literal ) or token )
            end
        end
    end
    return Text.char( codes, multiple, lenient )
end
function p.concatParams( frame )
    -- Template interface for Text.concatParams()
    --     template   -- "1": take the list from the calling template's
    --                   parameters instead of the #invoke parameters
    --     separator  -- separator between items
    --     format     -- format applied to each item
    local config = frame.args
    local fromTemplate = config.template
    if type( fromTemplate ) == "string" then
        fromTemplate = ( mw.text.trim( fromTemplate ) == "1" )
    end
    local args
    if fromTemplate then
        args = frame:getParent().args
    else
        args = config
    end
    return Text.concatParams( args, config.separator, config.format )
end
function p.listToFormat(frame)
    -- Fill a format string once per list position
    -- Parameters (frame.args):
    --     1, 2, ...  -- lists, each a string of items separated by sep
    --     sep        -- separator pattern for mw.text.split (default: ";")
    --     format     -- string with one "%s" per list; for every position
    --                   the n-th "%s" is replaced by the item of list n
    -- Returns: string; all filled-in formats concatenated,
    --          "" if no format is given
    local pformat = frame.args["format"]
    if not pformat then
        return ""
    end
    local sep = frame.args["sep"] or ";"
    -- Parse parameters: collect the lists by their numeric key
    local lists = {}
    for k, v in pairs(frame.args) do
        local knum = tonumber(k)
        if knum then lists[knum] = v end
    end
    -- Count the gapless run 1..n explicitly; the # operator is not
    -- reliable on a table that may have holes.
    local numLists = 0
    while lists[numLists + 1] ~= nil do
        numLists = numLists + 1
    end
    -- Split the lists
    local maxListLen = 0
    for i = 1, numLists do
        lists[i] = mw.text.split(lists[i], sep)
        if #lists[i] > maxListLen then maxListLen = #lists[i] end
    end
    -- Generate the result string
    local lines = {}
    for i = 1, maxListLen do
        local line = pformat
        for j = 1, numLists do
            -- Shorter lists contribute an empty item
            local item = lists[j][i] or ""
            -- A function replacement inserts the item literally, so a
            -- "%" inside an item is not read as a capture reference.
            line = mw.ustring.gsub(line, "%%s", function () return item end, 1)
        end
        lines[i] = line
    end
    return table.concat(lines)
end
function p.listToText( frame )
    -- Template interface for Text.listToText()
    --     template  -- "1": take the list from the calling template's
    --                  parameters instead of the #invoke parameters
    --     format    -- format applied to each item
    local config = frame.args
    local fromTemplate = config.template
    if type( fromTemplate ) == "string" then
        fromTemplate = ( mw.text.trim( fromTemplate ) == "1" )
    end
    local args
    if fromTemplate then
        args = frame:getParent().args
    else
        args = config
    end
    return Text.listToText( args, config.format )
end
function p.quote( frame )
    -- Template interface for Text.quote()
    --     1  -- text
    --     2  -- language code; empty means "not given"
    --     3  -- level, 1 or 2
    local lang = frame.args[2]
    if type( lang ) == "string" then
        local trimmed = mw.text.trim( lang )
        -- false makes Text.quote() fall back to its default language
        lang = ( trimmed ~= "" ) and trimmed
    end
    local text = frame.args[ 1 ] or ""
    local level = tonumber( frame.args[3] )
    return Text.quote( text, lang, level )
end
function p.quoteUnquoted( frame )
    -- Template interface for Text.quoteUnquoted()
    --     1  -- text
    --     2  -- language code; empty means "not given"
    --     3  -- level, 1 or 2
    local lang = frame.args[2]
    if type( lang ) == "string" then
        local trimmed = mw.text.trim( lang )
        -- false makes the library fall back to its default language
        lang = ( trimmed ~= "" ) and trimmed
    end
    local text = frame.args[ 1 ] or ""
    local level = tonumber( frame.args[3] )
    return Text.quoteUnquoted( text, lang, level )
end
function p.zip(frame)
    -- Interleave the items of several lists
    -- Parameters (frame.args):
    --     1, 2, ...        -- lists, each a string of separated items
    --     sep              -- default separator pattern (default: "")
    --     sep1, sep2, ...  -- separator pattern for the list of that number
    --     isep             -- text inserted between the items of one position
    --     osep             -- text inserted between positions
    -- Returns: string; for every position the items of all lists in
    --          order, missing items counting as ""
    local lists = {}
    local seps = {}
    local defaultsep = frame.args["sep"] or ""
    local innersep = frame.args["isep"] or ""
    local outersep = frame.args["osep"] or ""
    -- Parse parameters
    for k, v in pairs(frame.args) do
        local knum = tonumber(k)
        if knum then lists[knum] = v else
            if string.sub(k, 1, 3) == "sep" then
                local sepnum = tonumber(string.sub(k, 4))
                if sepnum then seps[sepnum] = v end
            end
        end
    end
    -- Where no explicit separator was given, use the default separator
    for i = 1, math.max(#seps, #lists) do
        if not seps[i] then seps[i] = defaultsep end
    end
    -- Split the lists
    local maxListLen = 0
    for i = 1, #lists do
        lists[i] = mw.text.split(lists[i], seps[i])
        if #lists[i] > maxListLen then maxListLen = #lists[i] end
    end
    -- Collect the pieces and join once instead of growing a string
    -- inside the nested loop
    local pieces = {}
    for i = 1, maxListLen do
        if i ~= 1 then pieces[#pieces + 1] = outersep end
        for j = 1, #lists do
            if j ~= 1 then pieces[#pieces + 1] = innersep end
            pieces[#pieces + 1] = lists[j][i] or ""
        end
    end
    return table.concat(pieces)
end
function p.split(frame)
    -- Template interface for Text.split(); returns one piece only
    --     text    or 1  -- string to split
    --     pattern or 2  -- separator
    --     plain   or 3  -- evaluated by yesNo(); literal separator if true
    --     index   or 4  -- piece to return (default: 1); negative values
    --                      count from the end
    local args = frame.args
    local text = args.text or args[1] or ''
    local pattern = args.pattern or args[2] or ''
    local plain = yesNo(args.plain or args[3])
    local index = tonumber(args.index) or tonumber(args[4]) or 1
    local parts = Text.split(text, pattern, plain)
    if index < 0 then
        index = #parts + index + 1
    end
    return parts[index]
end
function p.failsafe()
    -- Returns: the version stamp stored in Text.serial
    local version = Text.serial
    return version
end
function p.Text()
    -- Library access for other modules
    -- Returns: the Text table with all library functions
    return Text
end -- p.Text
return p