မော်ဂျူး:ja-parse

Documentation for this module may be created at မော်ဂျူး:ja-parse/doc
local export = {}

local len = mw.ustring.len
local sub = mw.ustring.sub
local gsub = mw.ustring.gsub
local find = mw.ustring.find
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch

local m_ja = require('Module:ja')

-- The three *_pattern strings below are the CONTENTS of a Lua character class:
-- they carry no surrounding brackets and are spliced in by callers as
-- '[' .. pattern .. ']'.

-- Ranges of kanji code points (the third range is written as raw UTF-8 byte escapes).
local kanji_pattern = "㐀-䶵一-鿌\239\164\128-\239\171\153𠀀-𯨟"
-- Hiragana and katakana ranges plus the prolonged sound mark.
local kana_pattern = 'ぁ-ゖァ-ヺー'
-- Kana and kanji, plus fullwidth Latin letters, fullwidth digits, 〆 and 々.
local japanese_pattern = kana_pattern .. kanji_pattern .. 'ａ-ｚＡ-Ｚ０-９〆々'

-- Set of template names treated as Japanese headword templates
-- (looked up by name when template headers are rewritten for category extraction).
-- The same names are also hard-coded as patterns in find_headword_template.
local headword_templates = {
	['ja-adj'] = true, ['ja-pos'] = true, ['ja-noun'] = true, ['ja-phrase'] = true,
	['ja-verb'] = true, ['ja-verb form'] = true, ['ja-verb-suru'] = true, 
}

-- Returns the text of a headword template call found in `wikitext`, or nil
-- when none of the known headword templates occurs.
-- The patterns are tried in the order listed; the first pattern that matches
-- anywhere in the text wins, regardless of where the other templates appear.
local function find_headword_template(wikitext)
	local header_patterns = {
		'{{ja%-adj[|}]',
		'{{ja%-pos[|}]',
		'{{ja%-noun[|}]',
		'{{ja%-phrase[|}]',
		'{{ja%-verb[|}]',
		'{{ja%-verb form[|}]',
		'{{ja%-verb%-suru[|}]',
	}
	for _, header in ipairs(header_patterns) do
		local start = wikitext:find(header)
		if start then
			-- %b{} grabs the balanced-brace span beginning at the opening "{{";
			-- this assumes that the template has matching braces.
			return wikitext:match('%b{}', start)
		end
	end
end

-- Splits a template call such as "{{name|a|k=v}}" into its name and arguments.
-- Only the simplest format is supported: the text is split on every "|",
-- except for the pipe inside a [[target|label]] link, which is protected.
-- Returns: name (the text before the first "|") and args, a table in which
-- "key=value" pieces are stored under the string key and all other pieces are
-- appended in order.
local function parse_template(wikitext)
	-- Temporarily turn the pipe of [[target|label]] links into a backtick so
	-- that it is not mistaken for an argument separator.
	local protected = wikitext:gsub('%[%[([^%[%]|]-)|([^%[%]|]-)%]%]', '[[%1`%2]]')

	-- Strip the enclosing braces (each gsub result is truncated to the string).
	local inner = protected:gsub('^{{', '')
	inner = inner:gsub('}}$', '')

	local name
	local args = {}
	for piece in mw.text.gsplit(inner, '|') do
		if name == nil then
			name = piece
		else
			-- Restore the protected link pipes.
			local restored = piece:gsub('`', '|')
			local key, value = match(restored, "(.-)=(.*)")
			if key and value then
				args[key] = value
			else
				table.insert(args, restored)
			end
		end
	end
	return name, args
end

-- Returns true when `item` equals (==) one of the elements list[1..#list],
-- false otherwise.
local function contains(list, item)
	local index, count = 1, #list
	while index <= count do
		if list[index] == item then
			return true
		end
		index = index + 1
	end
	return false
end


-- Parses the Japanese (==ဂျပန်==) section of the page titled `lemma` and returns a list of etym sections.
-- Each element has the form
--   { wikitext, type = ( 'lemma' | 'redirect' | '' ), spellings = <list of the term's modern spellings>, historical_spellings = <list of the term's historical kana spellings> }.
-- If the entry has multiple etymologies (an L3 header "ရင်းမြစ် ၁" is present), each numbered etymology L3 section is one etym section.
-- Otherwise the whole Japanese section minus any kanji (ခန်ဂျိ / ခန်ဂျိ n) L3 sections is a single etym section.
-- A missing page or an invalid title yields a single etym section with empty wikitext and type ''.
-- Raises an error if a {{ja-spellings}} call has an h6 parameter (at most five historical spellings are supported).
-- Note: The function divides sections strictly by L3 headers. As a result:
-- (1) If an entry describes both a kanji and a single word, any templates beginning the word (such as {{ja-spellings}}) would be considered part of the kanji section above. This function only remedies the cases of {{ja-spellings}} and {{ja-kanjitab}}, by inserting an empty header === === above them before parsing. (This problem is absent for entries with multiple etymologies, since each word must begin with a numbered etymology header.)
-- (2) If an entry describes multiple words, word-specific templates such as {{topics|ja|Biology}} must be placed at the end of the relevant word instead of the whole entry. If they are put at the end of the Japanese section, they will be considered part of either the final word or additional sections such as References, and ignored when {{ja-see}} copies categories around.

function export.extract_etym_sections(lemma)
	-- mw.title.new returns nil for an invalid title; treat that like a page without content.
	local title = mw.title.new(lemma)
	local page = title and title:getContent() or ''
	-- The Japanese L2 section runs up to the next "----" separator, or to the end of the page.
	local l2 = mw.ustring.match(page, '==ဂျပန်==\n(.-)%-%-%-%-') or mw.ustring.match(page, '==ဂျပန်==\n(.*)')
	
	-- split into L3 sections
	local l3_sections = {}
	local multi_etym = false
	
	if l2 then
		-- special hack mentioned above
		if not find(l2, '===ရင်းမြစ် ၁===') and (find(l2, '===ခန်ဂျိ===') or find(l2, '===ခန်ဂျိ %d+===')) then
			l2 = gsub(l2, '{{ja%-spellings', '=== ===\n{{ja-spellings')
			l2 = gsub(l2, '{{ja%-kanjitab', '=== ===\n{{ja-kanjitab')
		end
		
		-- Text before the first L3 header is collected under the empty title ''.
		local current_l3_title = ''
		local current_l3_content = {}
		for v in l2:gmatch('[^\n]+') do
			-- Exactly three leading "=": deeper headers (====...) stay inside the current section.
			if find(v, '^===[^=]') then
				table.insert(l3_sections, { current_l3_title, table.concat(current_l3_content, '\n') })
				current_l3_title = match(v, '^===([^=]+)')
				if current_l3_title == 'ရင်းမြစ် ၁' then multi_etym = true end
				current_l3_content = {}
			end
			table.insert(current_l3_content, v)
		end
		table.insert(l3_sections, { current_l3_title, table.concat(current_l3_content, '\n') })
	end
	
	-- group the L3 sections into etym sections
	local etym_sections = {}
	if multi_etym then
		for _, v in ipairs(l3_sections) do
			local header = v[1]
			local content = v[2]
			if find(header, '^ရင်းမြစ် %d+$') then
				table.insert(etym_sections, content)
			end
		end
	else
		local word = {}
		for _, v in ipairs(l3_sections) do
			local header = v[1]
			local content = v[2]
			if not (header == 'ခန်ဂျိ' or find(header, '^ခန်ဂျိ %d+$')) then
				table.insert(word, content)
			end
		end
		word = table.concat(word, '\n')
		table.insert(etym_sections, word)
	end
	
	-- finally, determine the type of each etym section
	for i = 1, #etym_sections do
		local etym_section = etym_sections[i]
		-- Every "-" must be escaped; a bare "-" is a lazy quantifier in Lua patterns.
		local ja_see = find(etym_section, '{{ja%-see[|}]') or find(etym_section, '{{ja%-see%-kango[|}]')
		if ja_see then
			local spellings = { lemma }
			-- Text of the redirect template up to its first "}}"; empty if the template is never closed.
			local ja_see_call = match(etym_section, '.-}}', ja_see) or ''
			for v in gmatch(ja_see_call, '[' .. japanese_pattern .. ']+') do
				table.insert(spellings, v)
			end
			etym_sections[i] = { etym_section, type = 'redirect', spellings = spellings, historical_spellings = {} }
		else
			local ja_forms = etym_section:find('{{ja%-spellings[|}]')
			if ja_forms then
				local spellings = { lemma }
				local historical_spellings = {}
				local _, args = parse_template(etym_section:match('%b{}', ja_forms))
				for j = 1, #args do
					table.insert(spellings, args[j])
				end
				-- Inserting a missing (nil) parameter leaves the list unchanged.
				table.insert(historical_spellings, args['h'])
				for n = 2, 5 do
					table.insert(historical_spellings, args['h' .. n])
				end
				if args['h6'] then error('ja-parse: I don\'t support more than five historical spellings at the moment. Please expand me.') end
				etym_sections[i] = { etym_section, type = 'lemma', spellings = spellings, historical_spellings = historical_spellings }
			else
				local headword_template = find_headword_template(etym_section)
				if headword_template then
					local spellings = { lemma }
					local historical_spellings = {}
					local _, args = parse_template(headword_template)
					for j = 1, #args do
						local candidate = args[j]
						-- Only positional arguments containing Japanese text count as spellings.
						if find(candidate, '[' .. japanese_pattern .. ']') then
							table.insert(spellings, m_ja.remove_ruby_markup(candidate))
						end
					end
					table.insert(historical_spellings, args.hhira)
					table.insert(historical_spellings, args.hkata)
					etym_sections[i] = { etym_section, type = 'lemma', spellings = spellings, historical_spellings = historical_spellings }
				else
					etym_sections[i] = { etym_section, type = '', spellings = {}, historical_spellings = {} }
				end
			end
		end
	end
	return etym_sections
end

-- Builds on export.extract_etym_sections: keeps only the etym sections of the page `lemma` that are of type 'lemma' and list `spelling` among their spellings.
-- Returns the wikitext of those sections joined with newlines, followed by the de-duplicated list of all spellings of the matching sections.
function export.get_etym_section(lemma, spelling)
	local matched_wikitext = {}
	local merged_spellings = {}
	for _, section in ipairs(export.extract_etym_sections(lemma)) do
		if section.type == 'lemma' and contains(section.spellings, spelling) then
			table.insert(matched_wikitext, section[1])
			for _, candidate in ipairs(section.spellings) do
				if not contains(merged_spellings, candidate) then
					table.insert(merged_spellings, candidate)
				end
			end
		end
	end
	return table.concat(matched_wikitext, '\n'), merged_spellings
end

-- Splits the wikitext of an etym section into definition lines and category links.
--   wikitext:     the section text to scan, line by line.
--   lemma:        title of the lemma page; when it is written in kana it is passed to headword templates as hira=.
--   alt_spelling: the spelling being redirected from; used to filter {{ja-def|...}} lines.
--   frame:        frame object whose preprocess method expands the remaining templates.
-- Returns:
--   def: list of { line, pos = <text of the most recent "===..." header, without the "=" signs> } for every line starting with "#"s and a space,
--        except {{rfdef}} lines and, when alt_spelling contains kanji, {{ja-def|}} lines that do not mention alt_spelling.
--   cat: a string made of all [[Category:...]] links produced by expanding the remaining (non-definition, non-header) lines.
function export.extract_definitions_and_categories(wikitext, lemma, alt_spelling, frame)
	local def = {}
	local other_lines = {}
	local current_section = ''
	
	-- True when the line has alt_spelling as a complete template argument.
	-- Plain (non-pattern) searches are used so that characters such as "-" or "." in the spelling are taken literally.
	local function mentions_alt_spelling(line)
		return line:find('|' .. alt_spelling .. '|', 1, true) or line:find('|' .. alt_spelling .. '}', 1, true)
	end
	
	for v in wikitext:gmatch('[^\n]+') do
		if v:find('^#+ ') then
			local skip = v:find('{{rfdef')
				or (v:find('{{ja%-def|') and not mentions_alt_spelling(v) and find(alt_spelling, '[' .. kanji_pattern .. ']'))
			if not skip then
				table.insert(def, { v, pos = current_section })
			end
		elseif v:find('^===') then
			-- Strip the surrounding "=" signs (gsub's match count is discarded by the single assignment).
			current_section = v:gsub("^=*(.-)=*$", "%1")
		else
			table.insert(other_lines, v)
		end
	end
	
	-- expand the other parts for categories
	local cat = table.concat(other_lines, '\n')
	cat = gsub(cat, '<ref', '')
	
	-- Built once per call rather than once per template header.
	local templates_to_exclude = {
		-- These templates are ignored as an optimization since they don't generate categories.
		['m'] = true, ['l'] = true, ['ja-l'] = true, ['ja-r'] = true, ['gloss'] = true,
		['w'] = true, ['wp'] = true, ['swp'] = true, ['wikipedia'] = true,
		['lang'] = true, ['furigana'] = true, ['wj'] = true, ['lj'] = true, ['ruby/ja-w2'] = true, ['ruby/ja'] = true, ['ruby'] = true,
		['ja-kanji forms'] = true, ['w2'] = true, ['sense'] = true,
		['IPAfont'] = true, ['IPAchar'] = true,
		['ja-adj-infl'] = true, ['ja-i'] = true, ['ja-na'] = true, ['ja-adjdecl'] = true, ['ja-decl-na'] = true, ['ja-go-bu'] = true, ['ja-go-gu'] = true, ['ja-go-ku'] = true, ['ja-go-mu'] = true, ['ja-go-nu'] = true, ['ja-go-ou'] = true, ['ja-go-ru'] = true, ['ja-go-su'] = true, ['ja-go-tsu'] = true, ['ja-go-u'] = true, ['ja-honorific'] = true, ['ja-ichi'] = true, ['ja-kuru'] = true, ['ja-suru'] = true, ['ja-suru-i-ku'] = true, ['ja-suru-tsu'] = true, ['ja-verbconj'] = true, ['ja-verbconj-auto'] = true, ['ja-verbconj-row'] = true, ['ja-verbconjugation'] = true, ['ja-zuru'] = true,
		['ja-kanji spellings'] = true, ['ja-ks'] = true, ['ja-spellings'] = true, ['ja-forms'] = true,
		['Japanese first-person pronouns'] = true, ['der-top'] = true, ['der-bottom'] = true,
		['der-mid'] = true, ['der-top3'] = true, ['der-top4'] = true, ['der-top5'] = true, ['rel-top'] = true,
		['ja-uk'] = true,
		
		-- These templates are ignored since they generate categories that are spelling-specific or that we're not interested in
		['ja-kanjitab'] = true, ['ateji'] = true, ['ja-ateji'] = true, ['ja-kanji'] = true, ['ja-readings'] = true,
		['juku'] = true, ['jukujikun'] = true, ['ja-jukujikun'] = true,
		['ja-def'] = true, ['synonyms'] = true,
	}
	
	-- Rewrites the header of one template call.
	-- If the template begins with "{{ja-usex|", a is "ja-usex" and b is "|".
	local function process_template_header(a, b)
		if templates_to_exclude[a] then
			-- Replace the template name with "=" so the original template is not invoked.
			return '{{=' .. b
		elseif headword_templates[a] then
			local source_script = m_ja.script(lemma)
			if source_script == 'Hira' or source_script == 'Kana' or source_script == 'Hira+Kana' then
				-- The lemma is written in kana: hand it to the headword template as hira=.
				return '{{' .. a .. '|hira=' .. lemma .. b
			else
				return '{{' .. a .. b
			end
		elseif a == 'ja-usex' or a:find('^quote') then -- special hack
			-- Emit the usage-example category directly instead of expanding the template.
			return '[[Category:နမူနာအသုံး ပါဝင်သော ဂျပန် ဝေါဟာရများ]]{{=' .. b
		else
			return '{{' .. a .. b
		end
	end
	cat = gsub(cat, '{{([^|}\n]+)\n?([|}])', process_template_header)
	
	-- Replace each {{ja-pron ...}} call by the categories derived from its parameters.
	cat = gsub(cat, '{{ja%-pron.-}}', function(pron)
		local result = ''
		if not find(pron, '|noipa=') then result = result .. '[[Category:IPA အသံထွက် ပါဝင်သော ဂျပန် ဝေါဟာရများ]]' end
		if find(pron, '|a=') or find(pron, '|audio=') then result = result .. '[[Category:အသံလင့်ခ်များ ပါဝင်သော ဂျပန် ဝေါဟာရများ]]' end
		return result
	end)
	cat = frame:preprocess(cat)
	
	-- Keep only the category links from the expanded text.
	local category_links = {}
	for link in gmatch(cat, '%[%[Category:.-%]%]') do table.insert(category_links, link) end
	cat = table.concat(category_links)
	-- one might want to modify the sortkeys here
	
	return def, cat
end

--[[
function export.fetch_reading(title, _wikitext) -- the _wikitext parameter is only used for recursion
	local wikitext
	if _wikitext then
		wikitext = _wikitext
	else
		local page = mw.title.new(title):getContent() or ''
		local l2 = page:match('==ဂျပန်==\n(.-)%-%-%-%-') or page:match('==ဂျပန်==\n(.*)') or ''
		if l2 == '' or l2:find('===ရင်းမြစ် ၁===') then
			wikitext = ''
		else
			wikitext = l2
		end
	end
	
	local kana
	local romaji
	
	local headword_template = find_headword_template(wikitext)
	if headword_template then
		for glob in mw.text.gsplit(headword_template:gsub('%}%}', '|'), '|') do
			if match(glob, '^[%s%.%-%^' .. kana_pattern .. '、]+$') then
				kana = glob; break
			end
		end
		romaji = headword_template:match("%|rom%=([^%|%}]+)")
	end
	
	if not kana and match(title, '^[' .. kana_pattern .. '、]+$') then
		kana = title
	end
	
	if not kana then
		if _wikitext then
			return nil
		else
			local main_entry = wikitext:match('{{ja%-see|(.-)[|}]')
			if main_entry then
				local new_wikitext = export.get_etym_section(main_entry, title)
				return export.fetch_reading(main_entry, new_wikitext:gsub('===ရင်းမြစ် %d+===', ''))
			else
				return nil
			end
		end
	end
	if not romaji then
		if headword_template:find('ja%-verb%|') then
			kana = gsub(kana, 'う$', '.う')
		elseif headword_template:find('ja%-adj') and (headword_template:find('%|infl=i') or headword_template:find('%|infl=い') or headword_template:find('%|decl=i') or headword_template:find('%|decl=い')) then
			kana = gsub(kana, 'い$', '.い')
		elseif headword_template:find('proper') then
			kana = gsub(kana, '^', '^')
			kana = gsub(kana, ' ', ' ^')
			kana = gsub(kana, '%-', '-^')
		end
		romaji = m_ja.kana_to_romaji(kana)
	end
	
	kana = gsub(kana, '[%s%.%-%^]', '')
	return kana, romaji
	
end
]]
return export