မော်ဂျူး:grc-translit

Documentation for this module may be created at မော်ဂျူး:grc-translit/doc
-- Table of functions exposed by this module; returned at the end of the file.
local export = {}

-- Shared Ancient Greek data table and the `tokenize` function of the
-- grc utilities module.
local m_data = require('Module:grc-utilities/data')
local tokenize = require('Module:grc-utilities').tokenize

-- Short aliases for the Unicode-aware string functions of mw.ustring.
local sub, find = mw.ustring.sub, mw.ustring.find
local match, gsub = mw.ustring.match, mw.ustring.gsub
local U = mw.ustring.char

-- Diacritic characters, looked up by name in the data module.
local chars = m_data.named
local acute, grave, circumflex = chars.acute, chars.grave, chars.circum
local diaeresis = chars.diaeresis
local smooth, rough = chars.smooth, chars.rough
local macron, breve = chars.macron, chars.breve
local subscript = chars.subscript

-- Character written in the transliteration in place of the Greek circumflex
-- (see the `circumflex` entry of the transliteration table).
local hat = chars.Latin_circum

-- Per-character transliteration table: maps one lowercase Greek letter or
-- one diacritic character to the Latin text that replaces it. It is used as
-- the replacement table of a gsub call, so any character without an entry
-- (for example the macron, grave and acute) passes through unchanged.
local tt = {
	-- Short/plain vowels
	['α'] = 'a', ['ε'] = 'e', ['ι'] = 'i', ['ο'] = 'o', ['υ'] = 'u',
	-- Long vowels: base letter followed by the macron character
	['η'] = 'e' .. macron,
	['ω'] = 'o' .. macron,

	-- Consonants written with a single Latin letter (both sigma forms give s)
	['β'] = 'b', ['γ'] = 'g', ['δ'] = 'd', ['ζ'] = 'z',
	['κ'] = 'k', ['λ'] = 'l', ['μ'] = 'm', ['ν'] = 'n',
	['ξ'] = 'x', ['π'] = 'p', ['ρ'] = 'r', ['τ'] = 't',
	['σ'] = 's', ['ς'] = 's',

	-- Consonants written with two Latin letters
	['θ'] = 'th', ['φ'] = 'ph', ['χ'] = 'kh', ['ψ'] = 'ps',

	-- Archaic letters
	['ϝ'] = 'w', ['ϻ'] = 'ś', ['ϙ'] = 'q', ['ϡ'] = 'š', ['ͷ'] = 'v',

	-- Diacritics: breve and both breathings are dropped here (the rough
	-- breathing is re-added as <h> by export.tr), the circumflex becomes the
	-- Latin circumflex character, and the iota subscript becomes a full <i>.
	[breve] = '',
	[smooth] = '',
	[rough] = '',
	[circumflex] = hat,
	[subscript] = 'i',
}

--- Transliterate Ancient Greek text into Latin script.
-- @param text  string to transliterate
-- @param lang  language code; only passed on to Module:Cprt-translit
-- @param sc    script code; "Cprt" forwards the call to Module:Cprt-translit
-- @return      the transliterated string, normalized to NFC
function export.tr(text, lang, sc)
	-- If the script is given as Cprt, then forward the transliteration to that module.
	-- This should not be necessary, as [[Module:translit-redirect]] redirects
	-- to this module only if script is polytonic.
	if sc == "Cprt" then
		-- [[Special:WhatLinksHere/Template:tracking/grc-translit/Cprt]]
		require('Module:debug').track('grc-translit/Cprt')
		return require('Module:Cprt-translit').tr(text, lang, sc)
	end
	
	-- Input consisting of nothing but this breathing sign is returned as <h>.
	if text == '῾' then
		return 'h'
	end
	
	--[[
		Replace semicolon or Greek question mark with regular question mark,
		except after an ASCII alphanumeric character (to avoid converting
		semicolons in HTML entities). The pattern needs a preceding
		character, so a semicolon at the very start of the text is kept too.
	]]
	text = gsub(text, "([^A-Za-z0-9])[;" .. U(0x37E) .. "]", "%1?")
	
	-- Handle the middle dot. It is equivalent to semicolon or colon, but semicolon is probably more common.
	text = gsub(text, "·", ";")
	
	local lower, upper = mw.ustring.lower, mw.ustring.upper
	
	-- Loop-invariant pattern: a macron followed, optionally across a
	-- breathing mark, by the Latin circumflex.
	local macron_before_hat = macron .. '[' .. rough .. smooth .. ']?' .. hat
	
	local tokens = tokenize(text)
	
	-- Read the tokens strictly in sequence. ipairs guarantees the order
	-- 1, 2, 3, ... which the concatenated output depends on; the order of
	-- pairs is unspecified. The neighbour lookups tokens[i - 1] and
	-- tokens[i + 1] already treat the token list as an integer-indexed array.
	local output = {}
	for i, token in ipairs(tokens) do
		local lower_token = lower(token)
		
		-- Substitute each character of the lowercased token for its
		-- transliteration; characters missing from tt are kept as they are.
		local translit = gsub(lower_token, '.', tt)
		
		local prev_token, next_token = tokens[i - 1], tokens[i + 1]
		
		-- The two neighbour rules compare lowercased tokens so that text
		-- written in capitals is treated like lowercase text; the proper
		-- case is restored by the capitalization step below.
		if lower_token == 'γ' and next_token and match(lower(next_token), '[κγχξ]') then
			-- γ before a velar should be <n>
			translit = 'n'
		elseif lower_token == 'ρ' and prev_token and lower(prev_token) == 'ρ' then
			-- ρ after ρ should be <rh>
			translit = 'rh'
		elseif match(token, '^[αΑ].*' .. subscript .. '$') then
			-- add macron to ᾳ
			translit = gsub(translit, '([aA])', '%1' .. macron)
		end
		
		-- tt removed the rough breathing; write it as <h>, after a rho and
		-- before anything else.
		if match(token, rough) then
			if match(token, '[Ρρ]') then
				translit = translit .. 'h'
			else -- vowel
				translit = 'h' .. translit
			end
		end
		
		-- Remove macron from a vowel that has a circumflex.
		if match(translit, macron_before_hat) then
			translit = gsub(translit, macron, '')
		end
		
		-- Capitalize first character of transliteration when the token
		-- contained an uppercase letter.
		if token ~= lower_token then
			translit = gsub(translit, "^.", upper)
		end
		
		output[#output + 1] = translit
	end
	
	return mw.ustring.toNFC(table.concat(output))
end

return export