မာတိကာသို့ ခုန်သွားရန်

မော်ဂျူး:ja-translit

ဝစ်ရှင်နရီ မှ

Documentation for this module may be created at မော်ဂျူး:ja-translit/doc

-- Module table; functions are attached to it below and it is returned at the end of the file.
local export = {}
-- Language-independent transliteration data, loaded through mw.loadData.
local data_common = mw.loadData'Module:ja-translit/data'
-- The common romanization entry for 'っ'. It is used below as a separator marker
-- (e.g. between "n" and a following vowel) and is replaced by "&#39;" in the final output.
local c_apos = data_common.rom['っ']

-- Builds a lookup function over the transliteration data.
-- The returned function takes a path of keys (e.g. d('rom', c)) and returns the
-- value found by indexing along that path, or nil when the path cannot be followed.
-- When `lang_name` is given and 'Module:ja-translit/data/<lang_name>' can be found
-- by package.loaders[2], values from that module take precedence over the common
-- data, with the common data used as the fallback.
local function get_data(lang_name)
	-- Follows the given keys into `node`; yields nil as soon as a non-table is met
	-- while keys remain.
	local function walk(node, ...)
		local keys = {...}
		for idx = 1, select('#', ...) do
			if type(node) ~= 'table' then return nil end
			node = node[keys[idx]]
		end
		return node
	end

	-- Lookup restricted to the common data.
	local function common_only(...)
		return walk(data_common[...], select(2, ...))
	end

	if not lang_name then return common_only end

	local module_name = 'Module:ja-translit/data/' .. lang_name
	if not package.loaders[2](module_name) then return common_only end

	local data_lang = mw.loadData(module_name)
	return function(...)
		local root_key = ...
		local from_lang, from_common = data_lang[root_key], data_common[root_key]
		for pos = 2, select('#', ...) do
			-- Language data ran out: finish the path in the common data alone.
			if type(from_lang) ~= 'table' then
				return walk(from_common, select(pos, ...))
			end
			local key = select(pos, ...)
			from_lang = from_lang[key]
			-- Common data ran out: finish the path in the language data alone
			-- (it has already consumed the key at `pos`).
			if type(from_common) ~= 'table' then
				return walk(from_lang, select(pos + 1, ...))
			end
			from_common = from_common[key]
		end
		if from_lang == nil then return from_common end
		return from_lang
	end
end

-- Converts the kana string `text` into romaji.
-- `options` is an optional table; the fields read in this function are
-- language_name (passed to get_data), hist, keep_period, phonetic and no_diacritics.
-- Returns the concatenated pieces with every match of c_apos replaced by "&#39;".
function export.kana_to_romaji(text, options)
	options = options or {}
	
	-- result: the output pieces, one per processed character (index 0 is an empty
	-- sentinel so that "no previous piece" lookups still yield a string).
	-- result_sp: per-piece tag taken from d('rom_sp', c), or 'stop' for letters.
	local result = {[0] = ''}
	local result_sp = {}
	
	local d = get_data(options.language_name)
	
	-- Scans `result` backwards from i_start (default: #result) and returns the index
	-- of the first piece accepted by predicate_good, or 0 when predicate_bad fires
	-- first or nothing is accepted. Pieces lying between a '>' piece and the
	-- preceding '<' piece are skipped.
	-- Default predicate_bad: the piece is tagged 'stop'.
	-- Default predicate_good: the piece is non-empty and not tagged '\''.
	local function getlast(i_start, predicate_good, predicate_bad)
		local in_xml = false
		for i = i_start or #result, 1, -1 do
			if in_xml then
				if result[i] == '<' then in_xml = false end
			elseif result[i] == '>' then
				in_xml = true
			else
				if (predicate_bad or function(index)
					return result_sp[index] == 'stop'
				end)(i) then break end
				if (predicate_good or function(index)
					return result[index]:len() > 0 and result_sp[index] ~= '\''
				end)(i) then return i end
			end
		end
		return 0
	end

	-- Preprocessing, then iteration over the text one UTF-8 character at a time
	-- (a lead byte followed by its continuation bytes):
	--   1. each run of repetition marks is replaced by the same number of characters
	--      preceding it, with a combining dakuten inserted for the voiced marks;
	--   2. characters in the range ァ-ヶ are shifted down by 96 code points;
	--   3. ゐ/ゑ/を followed by a combining dakuten are replaced via the table below.
	for c in mw.ustring.gsub(mw.ustring.gsub(text, '()([ゝヽゞヾ]+)',  function(p1, m2) -- repetition mark
		local len = mw.ustring.len(m2)
		local sec_rep = mw.ustring.sub(text, p1 - len, p1 - 1)
		for i = len, 1, -1 do
			if ({['ゞ'] = true, ['ヾ'] = true})[mw.ustring.sub(m2, i, i)] then
				sec_rep = mw.ustring.sub(sec_rep, 1, i) .. '゙' .. mw.ustring.sub(sec_rep, i + 1)
			end
		end
		return sec_rep
	end), '[ァ-ヶ]', function(m1) -- kata to hira
		return mw.ustring.char(mw.ustring.codepoint(m1) - 96)
	end):gsub("\227\130[\144-\146]゙", {
		-- convert ゐ゙, ゑ゙, を゙ to ヸ, ヹ, ヺ, to ensure voicing works correctly
		-- NOTE(review): the byte range in the pattern above (\144-\146) covers ゐ, ゑ, を only;
		-- わ゙ (→ ヷ) is not matched here — confirm whether that is intended
		['ゐ゙'] = 'ヸ', ['ゑ゙'] = 'ヹ', ['を゙'] = 'ヺ',
	}):gmatch'.[\128-\191]*' do
		-- rc: romanization of the current character (falls back to the character itself);
		-- rc_sp: its special tag, if any; i_last: index of the previous significant piece.
		local rc = options.hist and d('rom_hist', c) or d('rom', c) or c
		local rc_sp = d('rom_sp', c)
		local i_last = getlast()
		
		if options.keep_period and c == '.' then rc = '.'
		elseif c:match'%a' then rc_sp = 'stop' end
		
		-- If the current character forms a digraph with the previous piece, the previous
		-- piece is replaced and the current one contributes nothing.
		local repl_digraph = d('digraph', c, result[i_last])
		if repl_digraph then
			result[i_last], rc = repl_digraph, ''
			result_sp[i_last], rc_sp = nil, nil
		end
		
		if not options.hist then --はへ
			-- A previous piece whose tag is flagged in 'flag_hahe' is replaced by that tag
			-- when the current character is one of the listed marks, or its romanization
			-- contains a letter or equals c_apos.
			if d('flag_hahe', result_sp[i_last]) and (mw.ustring.match(c, '[-%.ー゙゚]') or  rc:match'%a' or  rc == c_apos) then
				result[i_last] = result_sp[i_last]
				result_sp[i_last] = nil
			end
			-- The current piece is replaced by its own flagged tag straight away when
			-- options.phonetic is set, when the nearest significant-or-'stop' piece is
			-- tagged 'stop', or when the previous piece contains a letter or hyphen or
			-- equals c_apos.
			if d('flag_hahe', rc_sp) and (options.phonetic or result_sp[getlast(nil, function(i)
				return result[i]:len() > 0 and result_sp[i] ~= '\'' or result_sp[i] == 'stop'
			end, function(i) return false end)] == 'stop'  or result[i_last]:match'[-%a]' or result[i_last] == c_apos) then
				rc = rc_sp
				rc_sp = nil
			end
		end
		
		if rc:match'%a' and mw.ustring.match(result[i_last], '^[,%.?!:)Ӡ]$') then --space and punctuations
			result[i_last] = result[i_last] .. ' '
		elseif mw.ustring.match(rc, '^[(“]$') and result[i_last]:match'%a' then
			rc = ' ' .. rc
		end
		
		-- Voicing marks rewrite the leading consonant cluster of the previous piece
		-- through the 'tr_voicing' / 'tr_semivoicing' replacement data.
		if rc_sp == 'voiced' then --voicing
			result[i_last] = result[i_last]:gsub('^[b-df-hj-np-tv-z]+', d('tr_voicing'))
		elseif rc_sp == 'semivoiced' then
			result[i_last] = result[i_last]:gsub('^[b-df-hj-np-tv-z]+', d('tr_semivoicing'))
		end
		
		if result[i_last] == 'n' and rc:match(options.hist and '^[aiueoyw]' or '^[aiueoy]') then --na vs n'a
			rc = c_apos .. rc
		end
		
		-- r_lastlast: the last letter of the previous piece together with any trailing
		-- non-letters.
		local r_lastlast = result[i_last]:match'^.*(%a%A*)$' --vowel clusters or stop consonants
		if r_lastlast then
			if c == 'ー' then
				-- Long-vowel mark: repeat the tail of the previous piece.
				result[i_last] = result[i_last] .. r_lastlast
			elseif r_lastlast:match("[aiueo]") then
				if rc:match'^%-[yw]' or options.hist and (r_lastlast == 'i' and rc:sub(1, 1) == 'y' or r_lastlast == 'u' and rc:sub(1, 1) == 'w') then
					-- Glide-initial piece: drop the previous piece's final character and
					-- attach the glide (the "y" itself is dropped after a flagged
					-- postalveolar consonant).
					if rc:sub(1, 1) == '-' then rc = rc:sub(2) end
					result[i_last] = result[i_last]:sub(1, -2)
					if rc:sub(1, 1) == 'y' and d('flag_postalveolarconsonant', result[i_last]) then rc = rc:sub(2) end
				elseif rc:match'^%-[aiueo]$' then
					-- Hyphen-prefixed single vowel: merges with the previous piece's
					-- final vowel according to the cases below.
					rc = rc:sub(2)
					if r_lastlast == rc then
						result[i_last] = result[i_last] .. r_lastlast
						rc = ''
					elseif d('flag_specialconsonant', result[i_last]) then
						result[i_last] = result[i_last]:sub(1, -2)
					elseif r_lastlast == 'i' then
						result[i_last] = result[i_last]:sub(1, -2) .. 'y'
					elseif r_lastlast:match'[ou]' and rc ~= 'u' then
						result[i_last] = result[i_last]:sub(1, -2) .. 'w'
					else
						result[i_last] = result[i_last]:sub(1, -2)
					end
				elseif rc:match'^[aiueo]$' then
					-- Plain vowel: joined onto the previous piece when the vowel pair has a
					-- 'tr_long' entry and the previous piece does not already end in two vowels.
					if not options.hist and not options.phonetic and d('tr_long', r_lastlast .. rc) and not result[i_last]:match'[aiueo][aiueo]$' then
						result[i_last] = result[i_last] .. rc
						rc = ''
					end
				end
			end
		end
		
		table.insert(result, rc)
		result_sp[#result] = rc_sp
	end
	
	if not options.hist then --isolated はへ
		-- A flagged final piece with no significant piece before it is replaced by its tag.
		local i_last = getlast()
		if d('flag_hahe', result_sp[i_last]) and getlast(i_last - 1) == 0 then
			result[i_last] = result_sp[i_last]
		end
	end
	
	-- Second pass over the pieces: gemination, "ng" disambiguation, diacritics and
	-- capitalisation.
	-- num_cap: number of letters still to be uppercased; has_gem: a piece tagged
	-- 'gem' has been seen and is waiting for a following consonant.
	local num_cap = 0
	local has_gem = false
	for i, v in ipairs(result) do
		 --gemination
		if has_gem then
			local apos, consonant, remainder = v:match('^(' .. c_apos .. '*)([b-df-hj-np-tv-z]+)(.*)')
			if consonant then
				-- The preceding pieces tagged 'gem' are overwritten with d('tr_gem', ...) or,
				-- failing that, with the first letter of the consonant cluster; any leading
				-- c_apos is removed from this piece and prefixed to an earlier piece instead.
				local c_gem = d('tr_gem', apos .. consonant) or consonant:sub(1, 1)
				v = consonant .. remainder
				local i_gem = getlast(i)
				while true do
					i_gem = getlast(i_gem - 1)
					if result_sp[i_gem] == 'gem' then
						result[i_gem] = c_gem
					else
						i_gem = getlast(i_gem + 1)
						result[i_gem] = apos .. result[i_gem]
						break
					end
				end
				has_gem = false
			end
		elseif result_sp[i] == 'gem' then
			has_gem = true
		end
		
		-- anga vs a'nga
		if v:match'^ng' then
			-- Appends c_apos to the nearest earlier non-'gem' significant piece when that
			-- piece contains a letter.
			local i_no_gem = getlast(i - 1, function(index)
				return result[index]:len() > 0 and result_sp[index] ~= '\'' and result_sp[index] ~= 'gem'
			end)
			if mw.ustring.match(result[i_no_gem], '%a') then
				result[i_no_gem] = result[i_no_gem] .. c_apos
			end
		end
		
		--diacritics (long vowels and others)
		if not options.no_diacritics then
			v = v:gsub('[aiueo][aiueo%A]*', d('tr_long'))
		end
		
		--uppercase
		-- Each piece tagged 'cap' requests one more uppercased letter; every character
		-- actually changed by mw.ustring.upper consumes one request.
		if result_sp[i] == 'cap' then num_cap = num_cap + 1 end
		if num_cap > 0 then
			v = v:gsub('.[\128-\191]*', function(c)
				if num_cap <= 0 then return c end
				local uc = mw.ustring.upper(c)
				if c ~= uc then num_cap = num_cap - 1 end
				return uc
			end)
		end
		result[i] = v
	end
	-- Extra parentheses drop gsub's second return value (the replacement count).
	return (table.concat(result):gsub(c_apos, "&#39;"))
end

-- Reports whether `str` looks like finished romaji: after all non-letter
-- characters are removed, every remaining character must be an ASCII letter or
-- one of the macron vowels ĀĪŪĒŌ / āīūēō.
-- Returns true or false.
local function is_good_romaji(str)
	str = mw.ustring.gsub(str, '%A', '')
	-- The ASCII ranges are A-Z and a-z; the former "A-z" range also spanned the
	-- punctuation code points between "Z" and "a".
	return mw.ustring.match(str, '[^A-Za-zĀĪŪĒŌāīūēō]') == nil
end

-- Adjusts the romaji string `rom` for the part of speech `pos`:
--   'proper'                          -> the first letter of every word is uppercased,
--                                        unless `rom` already contains an uppercase letter;
--   'prefix'                          -> `rom` ends with exactly the hyphen it had, or gains one;
--   'suffix', 'counter', 'classifier' -> `rom` starts with the hyphen it had, or gains one;
--   anything else                     -> `rom` is returned unchanged.
local function format_pos_romaji(rom,  pos)
	if pos == 'proper' then
		-- find returns nil when there is no uppercase letter. (gmatch would return
		-- an iterator function, which is always truthy.)
		if mw.ustring.find(rom, '%u') then return rom end
		return (mw.ustring.gsub(rom, '%f[%a]%a', mw.ustring.upper))
	elseif pos == 'prefix' then
		-- Explicit check instead of gsub('%-?$', '-'): on Lua 5.1 that pattern matches
		-- a second time at the end of the string and doubles an existing hyphen.
		if rom:sub(-1) == '-' then return rom end
		return rom .. '-'
	elseif pos == 'suffix' or pos == 'counter' or pos == 'classifier' then
		if rom:sub(1, 1) == '-' then return rom end
		return '-' .. rom
	end
	return rom
end

-- Transliterates `text`, treating it as a page title.
-- The title itself is romanized first (rom_title; nil when the result is not
-- accepted by is_good_romaji). If a page with that title has content, the
-- {{ja-noun}}, {{ja-verb}}, ... and {{ja-pos}} headword templates in its wikitext
-- are scanned for kana readings; when they all yield the same romanization, that
-- one is returned, and as soon as two of them disagree rom_title is returned.
-- Parameters:
--   text  page title / kana text to romanize
--   lang  language name, forwarded to kana_to_romaji as options.language_name
--   sc    unused here; kept so the signature stays unchanged
-- Returns a romaji string or nil.
function export.tr(text, lang, sc)
	local options = { language_name = lang }
	local rom_result
	
	local rom_title = export.kana_to_romaji(text, options)
	if not is_good_romaji(rom_title) then rom_title = nil end
	
	-- mw.title.new returns nil for text that is not a valid title; guard against
	-- indexing nil so such input falls back to rom_title instead of raising.
	local title = mw.title.new(text)
	local pagetext = title and title:getContent()
	if pagetext then
		for _, tn in ipairs{'noun', 'verb', 'verb%-suru', 'adj', 'phrase', 'combining form', 'verb form', 'see'} do
			-- Template used without parameters: the reading is the title itself.
			if rom_title and pagetext:match('{{ja%-' .. tn .. '}}') then
				if rom_result and rom_result ~= rom_title then return rom_title end
				rom_result = rom_title
			end
			for t in pagetext:gmatch('{{ja%-' .. tn .. '(|..-})}') do
				local no_kana = true
				-- Each parameter between '|' delimiters; named parameters and
				-- parameters containing wikilinks are skipped.
				for tt in t:gmatch'%f[^|]..-%f[|}]' do
					if not tt:match'%D.*=' and not tt:match'%[%[' and not tt:match']]' then
						local rom = export.kana_to_romaji(tt, options)
						if is_good_romaji(rom) then
							no_kana = false
							if rom_result and rom_result ~= rom then return rom_title end
							rom_result = rom
						end
					end
				end
				-- No usable reading among the parameters: count the title's own reading.
				if rom_title and no_kana then
					if rom_result and rom_result ~= rom_title then return rom_title end
					rom_result = rom_title
				end
			end
		end
		
		-- {{ja-pos|<pos>|...}}: same scan, with the result adjusted for the part of speech.
		for t in pagetext:gmatch'{{ja%-pos|(..-})}' do
			local pos, ta = t:match'^(..-)(|..-})$'
			if ta then
				local no_kana = true
				for tt in ta:gmatch'%f[^|]..-%f[|}]' do
					if not tt:match'%D.*=' and not tt:match'%[%[' and not tt:match']]' then
						local rom = export.kana_to_romaji(tt, options)
						if is_good_romaji(rom) then
							no_kana = false
							rom = format_pos_romaji(rom, pos)
							if rom_result and rom_result ~= rom then return rom_title end
							rom_result = rom
						end
					end
				end
				if rom_title and no_kana then
					local rom = format_pos_romaji(rom_title, pos)
					if rom_result and rom_result ~= rom then return rom_title end
					rom_result = rom
				end
			elseif rom_title then
				-- Only the part of speech was given; t still carries the trailing '}'.
				local rom = format_pos_romaji(rom_title, t:sub(1, -2))
				if rom_result and rom_result ~= rom then return rom_title end
				rom_result = rom
			end
		end
		return rom_result or rom_title
	else
		return rom_title
	end
end

-- A hack to bypass [[mod:languages]] bug [[special:diff/72585061]]:
-- the original export.tr is kept in f_tr and wrapped so that every "&#39;" in
-- its result is replaced by a <nowiki> extension tag containing an apostrophe.
-- Nothing is returned when the wrapped function yields no romanization.
local f_tr = export.tr
function export.tr(...)
	local romaji = f_tr(...)
	if not romaji then return end
	local apostrophe = mw.getCurrentFrame():extensionTag('nowiki', '\'')
	return (romaji:gsub("&#39;", apostrophe))
end

return export