-- မော်ဂျူး:ja-parse
-- ပုံပန်းသွင်ပြင်
-- Documentation for this module may be created at မော်ဂျူး:ja-parse/doc
local export = {}
local len = mw.ustring.len
local sub = mw.ustring.sub
local gsub = mw.ustring.gsub
local find = mw.ustring.find
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch
local m_ja = require('Module:ja')
-- Character-class fragments (meant to be wrapped in [...] and used with mw.ustring).
-- kanji_pattern: CJK Extension A, the main CJK Unified Ideographs range,
-- U+F900-U+FAD9 (written as UTF-8 byte escapes) and the supplementary-plane
-- ideographs from U+20000 up to U+2FA1F (also written as byte escapes).
-- Every range must be closed here: a trailing "-" would fuse with the "a-z"
-- appended in japanese_pattern and form a bogus range starting at U+20000.
local kanji_pattern = "㐀-䶵一-鿌\239\164\128-\239\171\153𠀀-\240\175\168\159"
-- Hiragana, katakana and the prolonged sound mark.
local kana_pattern = 'ぁ-ゖァ-ヺー'
-- Anything that may appear in a Japanese spelling: kana, kanji, ASCII
-- alphanumerics and the marks 〆 and 々.
local japanese_pattern = kana_pattern .. kanji_pattern .. 'a-zA-Z0-9〆々'
-- Set of template names treated as Japanese headword templates.
-- extract_definitions_and_categories looks names up here; note that
-- find_headword_template spells out the same names as separate patterns, so
-- the two lists have to be kept in sync by hand.
local headword_templates = {
['ja-adj'] = true, ['ja-pos'] = true, ['ja-noun'] = true, ['ja-phrase'] = true,
['ja-verb'] = true, ['ja-verb form'] = true, ['ja-verb-suru'] = true,
}
-- Returns the text of a headword template call found in `wikitext`
-- (from its opening "{{" to the balancing "}}"), or nothing when no headword
-- template is present. The template names are tried in the fixed order below,
-- so the result is the first *name* that occurs anywhere in the text, not
-- necessarily the call that appears earliest.
local function find_headword_template(wikitext)
	local opening_patterns = {
		'{{ja%-adj[|}]',
		'{{ja%-pos[|}]',
		'{{ja%-noun[|}]',
		'{{ja%-phrase[|}]',
		'{{ja%-verb[|}]',
		'{{ja%-verb form[|}]',
		'{{ja%-verb%-suru[|}]',
	}
	for _, opening in ipairs(opening_patterns) do
		local start = wikitext:find(opening)
		if start then
			-- This assumes that the template has matching braces.
			return wikitext:match('%b{}', start)
		end
	end
end
-- Splits a single template call into its name and arguments.
-- Only the simplest format is supported: the text is split on every "|",
-- except for the pipe inside a [[target|label]] link, which is temporarily
-- swapped for a backtick so that the link stays within one argument.
-- Returns the template name and a table holding "key=value" arguments under
-- their (string) key and the remaining arguments as a sequence.
local function parse_template(wikitext)
	local shielded = wikitext:gsub('%[%[([^%[%]|]-)|([^%[%]|]-)%]%]', '[[%1`%2]]')
	local inner = shielded:gsub('^{{', '')
	inner = inner:gsub('}}$', '')

	local name
	local args = {}
	for piece in mw.text.gsplit(inner, '|') do
		if name == nil then
			-- the first piece is the template name
			name = piece
		else
			-- bring back the pipes that were hidden above
			local restored = piece:gsub('`', '|')
			local key, value = match(restored, "(.-)=(.*)")
			if key and value then
				args[key] = value
			else
				args[#args + 1] = restored
			end
		end
	end
	return name, args
end
-- Reports whether `item` is equal (==) to one of the elements list[1..#list].
local function contains(list, item)
	local index, count = 1, #list
	while index <= count do
		if list[index] == item then
			return true
		end
		index = index + 1
	end
	return false
end
-- Parses the ==ဂျပန်== (Japanese) section of the page titled `lemma` and returns
-- a list of etym sections, each of the form
--   { wikitext, type = ('lemma' | 'redirect' | ''),
--     spellings = <list of the term's modern spellings>,
--     historical_spellings = <list of the term's historical kana spellings> }.
-- When the entry has a ===ရင်းမြစ် ၁=== header, each ===ရင်းမြစ် n=== part is one
-- etym section. Otherwise the whole Japanese section minus any ===ခန်ဂျိ=== /
-- ===ခန်ဂျိ n=== subsections forms a single etym section. A missing page, an
-- invalid title or a page without a Japanese section yields that single
-- section with empty wikitext and type ''.
-- Note: sections are divided strictly by L3 headers. As a result:
-- (1) If an entry describes both a kanji and a single word, templates that
--     begin the word would be counted as part of the kanji section above them.
--     This is remedied only for {{ja-spellings}} and {{ja-kanjitab}}, by
--     inserting an empty header "=== ===" before them prior to splitting.
--     (Multi-etymology entries are unaffected because each word starts with
--     its own ===ရင်းမြစ် n=== header.)
-- (2) If an entry describes multiple words, word-specific templates such as
--     {{topics|ja|Biology}} must be placed at the end of the relevant word. At
--     the end of the whole Japanese entry they end up in the final word or in
--     a trailing section such as references.
function export.extract_etym_sections(lemma)
	-- mw.title.new returns nil for an invalid title; treat it like a missing page.
	local title = mw.title.new(lemma)
	local page = title and title:getContent() or ''
	-- The section ends at the next "----" separator, or at the end of the page.
	local l2 = mw.ustring.match(page, '==ဂျပန်==\n(.-)%-%-%-%-') or mw.ustring.match(page, '==ဂျပန်==\n(.*)')

	-- split into L3 sections, stored as { title, content } pairs
	local l3_sections = {}
	local multi_etym = false
	if l2 then
		-- special hack (1) described above
		if not find(l2, '===ရင်းမြစ် ၁===') and (find(l2, '===ခန်ဂျိ===') or find(l2, '===ခန်ဂျိ %d+===')) then
			l2 = gsub(l2, '{{ja%-spellings', '=== ===\n{{ja-spellings')
			l2 = gsub(l2, '{{ja%-kanjitab', '=== ===\n{{ja-kanjitab')
		end
		local current_l3_title = ''
		local current_l3_content = {}
		for line in l2:gmatch('[^\n]+') do
			if find(line, '^===[^=]') then
				-- a new L3 header closes the section collected so far
				table.insert(l3_sections, { current_l3_title, table.concat(current_l3_content, '\n') })
				current_l3_title = match(line, '^===([^=]+)')
				if current_l3_title == 'ရင်းမြစ် ၁' then multi_etym = true end
				current_l3_content = {}
			end
			table.insert(current_l3_content, line)
		end
		table.insert(l3_sections, { current_l3_title, table.concat(current_l3_content, '\n') })
	end

	-- group the L3 sections into etym sections
	local etym_sections = {}
	if multi_etym then
		for _, section in ipairs(l3_sections) do
			if find(section[1], '^ရင်းမြစ် %d+$') then
				table.insert(etym_sections, section[2])
			end
		end
	else
		local word = {}
		for _, section in ipairs(l3_sections) do
			local header = section[1]
			if not (header == 'ခန်ဂျိ' or find(header, '^ခန်ဂျိ %d+$')) then
				table.insert(word, section[2])
			end
		end
		table.insert(etym_sections, table.concat(word, '\n'))
	end

	-- finally, determine the type of each etym section
	for i = 1, #etym_sections do
		local etym_section = etym_sections[i]
		local spellings = {}
		local historical_spellings = {}
		local section_type = ''

		-- Both hyphens of "ja-see-kango" must be escaped; an unescaped "-" is a
		-- lazy repetition operator in Lua patterns.
		local ja_see = find(etym_section, '{{ja%-see[|}]') or find(etym_section, '{{ja%-see%-kango[|}]')
		if ja_see then
			section_type = 'redirect'
			spellings = { lemma }
			-- An unterminated call contributes no further spellings.
			local call = match(etym_section, '.-}}', ja_see) or ''
			for spelling in gmatch(call, '[' .. japanese_pattern .. ']+') do
				table.insert(spellings, spelling)
			end
		else
			local ja_forms = etym_section:find('{{ja%-spellings[|}]')
			if ja_forms then
				section_type = 'lemma'
				spellings = { lemma }
				local _, args = parse_template(etym_section:match('%b{}', ja_forms))
				for n = 1, #args do
					table.insert(spellings, args[n])
				end
				-- historical spellings are given as h, h2 ... h5
				for _, key in ipairs({ 'h', 'h2', 'h3', 'h4', 'h5' }) do
					if args[key] then
						table.insert(historical_spellings, args[key])
					end
				end
				if args['h6'] then error('ja-parse: I don\'t support more than five historical spellings at the moment. Please expand me.') end
			else
				local headword_template = find_headword_template(etym_section)
				if headword_template then
					section_type = 'lemma'
					spellings = { lemma }
					local _, args = parse_template(headword_template)
					for n = 1, #args do
						local candidate = args[n]
						-- keep only positional arguments that contain Japanese text
						if find(candidate, '[' .. japanese_pattern .. ']') then
							table.insert(spellings, m_ja.remove_ruby_markup(candidate))
						end
					end
					if args.hhira then table.insert(historical_spellings, args.hhira) end
					if args.hkata then table.insert(historical_spellings, args.hkata) end
				end
			end
		end

		etym_sections[i] = {
			etym_section,
			type = section_type,
			spellings = spellings,
			historical_spellings = historical_spellings,
		}
	end
	return etym_sections
end
-- Built on export.extract_etym_sections: keeps only the etym sections of type
-- 'lemma' whose spellings include `spelling`.
-- Returns the wikitext of those sections joined by newlines, followed by the
-- list of all their spellings with duplicates removed.
function export.get_etym_section(lemma, spelling)
	local matching_wikitext = {}
	local merged_spellings = {}
	for _, section in ipairs(export.extract_etym_sections(lemma)) do
		if section.type == 'lemma' and contains(section.spellings, spelling) then
			matching_wikitext[#matching_wikitext + 1] = section[1]
			for _, candidate in ipairs(section.spellings) do
				if not contains(merged_spellings, candidate) then
					merged_spellings[#merged_spellings + 1] = candidate
				end
			end
		end
	end
	return table.concat(matching_wikitext, '\n'), merged_spellings
end
-- Templates whose calls are neutralised (their name is replaced by "=") before
-- the non-definition text is expanded for categories. Built once at module
-- load rather than once per template occurrence.
local templates_to_exclude = {
	-- These templates are ignored as an optimization since they don't generate categories.
	['m'] = true, ['l'] = true, ['ja-l'] = true, ['ja-r'] = true, ['gloss'] = true,
	['w'] = true, ['wp'] = true, ['swp'] = true, ['wikipedia'] = true,
	['lang'] = true, ['furigana'] = true, ['wj'] = true, ['lj'] = true, ['ruby/ja-w2'] = true, ['ruby/ja'] = true, ['ruby'] = true,
	['ja-kanji forms'] = true, ['w2'] = true, ['sense'] = true,
	['IPAfont'] = true, ['IPAchar'] = true,
	['ja-adj-infl'] = true, ['ja-i'] = true, ['ja-na'] = true, ['ja-adjdecl'] = true, ['ja-decl-na'] = true, ['ja-go-bu'] = true, ['ja-go-gu'] = true, ['ja-go-ku'] = true, ['ja-go-mu'] = true, ['ja-go-nu'] = true, ['ja-go-ou'] = true, ['ja-go-ru'] = true, ['ja-go-su'] = true, ['ja-go-tsu'] = true, ['ja-go-u'] = true, ['ja-honorific'] = true, ['ja-ichi'] = true, ['ja-kuru'] = true, ['ja-suru'] = true, ['ja-suru-i-ku'] = true, ['ja-suru-tsu'] = true, ['ja-verbconj'] = true, ['ja-verbconj-auto'] = true, ['ja-verbconj-row'] = true, ['ja-verbconjugation'] = true, ['ja-zuru'] = true,
	['ja-kanji spellings'] = true, ['ja-ks'] = true, ['ja-spellings'] = true, ['ja-forms'] = true,
	['Japanese first-person pronouns'] = true, ['der-top'] = true, ['der-bottom'] = true,
	['der-mid'] = true, ['der-top3'] = true, ['der-top4'] = true, ['der-top5'] = true, ['rel-top'] = true,
	['ja-uk'] = true,
	-- These templates are ignored since they generate categories that are spelling-specific or that we're not interested in
	['ja-kanjitab'] = true, ['ateji'] = true, ['ja-ateji'] = true, ['ja-kanji'] = true, ['ja-readings'] = true,
	['juku'] = true, ['jukujikun'] = true, ['ja-jukujikun'] = true,
	['ja-def'] = true, ['synonyms'] = true,
}

-- Extracts the definition lines and the categories from an etym section.
--   wikitext:     wikitext of the etym section(s), one construct per line
--   lemma:        title of the page the wikitext comes from
--   alt_spelling: the spelling the definitions are being collected for
--   frame:        frame object, used to expand the non-definition text
-- Returns:
--   def: list of { line, pos = <title of the enclosing header> } for every
--        line starting with "#"s and a space, except {{rfdef}} lines and,
--        when alt_spelling contains kanji, {{ja-def|...}} lines that do not
--        list alt_spelling
--   cat: the [[Category:...]] links produced by expanding the remaining lines,
--        concatenated into one string
function export.extract_definitions_and_categories(wikitext, lemma, alt_spelling, frame)
	-- True when `line` has alt_spelling as a complete template argument.
	-- Plain-text search, so characters of alt_spelling that are magic in Lua
	-- patterns are matched literally.
	local function lists_alt_spelling(line)
		return line:find('|' .. alt_spelling .. '|', 1, true)
			or line:find('|' .. alt_spelling .. '}', 1, true)
	end

	local def = {}
	local other_lines = {}
	local current_section = ''
	for v in wikitext:gmatch('[^\n]+') do
		if v:find('^#+ ') then
			local restricted_to_other_spelling = v:find('{{ja%-def|')
				and not lists_alt_spelling(v)
				and find(alt_spelling, '[' .. kanji_pattern .. ']')
			if not v:find('{{rfdef') and not restricted_to_other_spelling then
				table.insert(def, { v, pos = current_section })
			end
		elseif v:find('^===') then
			-- strip the surrounding "=" signs to get the header title
			current_section = v:gsub("^=*(.-)=*$", "%1")
		else
			table.insert(other_lines, v)
		end
	end

	-- expand the other parts for categories
	local cat = table.concat(other_lines, '\n')
	cat = gsub(cat, '<ref', '')

	-- Rewrites the start of a template call.
	-- If the template begins with "{{ja-usex|", a is "ja-usex" and b is "|".
	local function process_template_header(a, b)
		if templates_to_exclude[a] then
			return '{{=' .. b
		elseif headword_templates[a] then
			-- For a kana lemma, hand the lemma to the headword template as hira=.
			local source_script = m_ja.script(lemma)
			if source_script == 'Hira' or source_script == 'Kana' or source_script == 'Hira+Kana' then
				return '{{' .. a .. '|hira=' .. lemma .. b
			else
				return '{{' .. a .. b
			end
		elseif a == 'ja-usex' or a:find('^quote') then -- special hack
			return '[[Category:နမူနာအသုံး ပါဝင်သော ဂျပန် ဝေါဟာရများ]]{{=' .. b
		else
			return '{{' .. a .. b
		end
	end
	cat = gsub(cat, '{{([^|}\n]+)\n?([|}])', process_template_header)

	-- {{ja-pron}} calls are replaced by the categories derived from their arguments.
	cat = gsub(cat, '{{ja%-pron.-}}', function(pron)
		local result = ''
		if not find(pron, '|noipa=') then result = result .. '[[Category:IPA အသံထွက် ပါဝင်သော ဂျပန် ဝေါဟာရများ]]' end
		if find(pron, '|a=') or find(pron, '|audio=') then result = result .. '[[Category:အသံလင့်ခ်များ ပါဝင်သော ဂျပန် ဝေါဟာရများ]]' end
		return result
	end)

	cat = frame:preprocess(cat)

	-- keep nothing but the category links
	local category_links = {}
	for link in gmatch(cat, '%[%[Category:.-%]%]') do
		table.insert(category_links, link)
	end
	cat = table.concat(category_links)
	-- one might want to modify the sortkeys here
	return def, cat
end
--[[
function export.fetch_reading(title, _wikitext) -- the _wikitext parameter is only used for recursion
local wikitext
if _wikitext then
wikitext = _wikitext
else
local page = mw.title.new(title):getContent() or ''
local l2 = page:match('==ဂျပန်==\n(.-)%-%-%-%-') or page:match('==ဂျပန်==\n(.*)') or ''
if l2 == '' or l2:find('===ရင်းမြစ် ၁===') then
wikitext = ''
else
wikitext = l2
end
end
local kana
local romaji
local headword_template = find_headword_template(wikitext)
if headword_template then
for glob in mw.text.gsplit(headword_template:gsub('%}%}', '|'), '|') do
if match(glob, '^[%s%.%-%^' .. kana_pattern .. '、]+$') then
kana = glob; break
end
end
romaji = headword_template:match("%|rom%=([^%|%}]+)")
end
if not kana and match(title, '^[' .. kana_pattern .. '、]+$') then
kana = title
end
if not kana then
if _wikitext then
return nil
else
local main_entry = wikitext:match('{{ja%-see|(.-)[|}]')
if main_entry then
local new_wikitext = export.get_etym_section(main_entry, title)
return export.fetch_reading(main_entry, new_wikitext:gsub('===ရင်းမြစ် %d+===', ''))
else
return nil
end
end
end
if not romaji then
if headword_template:find('ja%-verb%|') then
kana = gsub(kana, 'う$', '.う')
elseif headword_template:find('ja%-adj') and (headword_template:find('%|infl=i') or headword_template:find('%|infl=い') or headword_template:find('%|decl=i') or headword_template:find('%|decl=い')) then
kana = gsub(kana, 'い$', '.い')
elseif headword_template:find('proper') then
kana = gsub(kana, '^', '^')
kana = gsub(kana, ' ', ' ^')
kana = gsub(kana, '%-', '-^')
end
romaji = m_ja.kana_to_romaji(kana)
end
kana = gsub(kana, '[%s%.%-%^]', '')
return kana, romaji
end
]]
return export