-- မော်ဂျူး:uk-pronunciation
--
-- Documentation for this module may be created at မော်ဂျူး:uk-pronunciation/doc
-- Table of public functions; it is returned at the end of the module.
local export = {}

-- IPA formatting helpers (used for the "template" output mode).
local m_IPA = require("Module:IPA")

-- Language object for Ukrainian (code "uk"), passed to the IPA formatter.
local uk = require("Module:languages").getByCode("uk")

-- Unicode-aware gsub: patterns and character classes below contain
-- multi-byte Cyrillic and IPA characters.
local gsub = mw.ustring.gsub
-- Builds a string from Unicode code points.
local U = mw.ustring.char
-- U+0301 COMBINING ACUTE ACCENT and U+0300 COMBINING GRAVE ACCENT; both are
-- treated as stress marks in the input word.
local acute = U(0x301)
local grave = U(0x300)

--- Generates an IPA transcription for a Ukrainian word.
--
-- May be called either with plain values or with a single frame-like table as
-- the first argument; in the latter case `word`, `accent` and `output` are read
-- from the table's `args` (falling back to the parent's `args`).
--
-- @param word    Cyrillic word with stress marked by a combining acute/grave
--                accent, or a frame-like table. When empty or missing, the
--                current page title is used.
-- @param accent  Pass "off" to suppress both the missing-accent warning and the
--                automatic stressing of monosyllables.
-- @param output  Pass "template" to get formatted IPA markup (plus a preview
--                warning and tracking category when the accent is missing);
--                any other value returns the bare phonetic string.
-- @return string
function export.pronunciation(word, accent, output)
	if type(word) == "table" then
		word, accent, output =
			word.args[1] or word:getParent().args[1],
			word.args.accent or word:getParent().args.accent,
			word.args.output or word:getParent().args.output
	end
	
	if not word or (word == "") then
		word = mw.title.getCurrentTitle().text
	end
	
	--	Returns an error if the word contains alphabetic characters that are not Cyrillic.
	require("Module:script utilities").checkScript(word, "Cyrl")
	
	word = mw.ustring.lower(word)
	
	-- A word with more than one vowel letter and no accent mark cannot be
	-- transcribed reliably; remember that so a warning can be shown.
	local needsAccent = false
	
	if accent ~= "off" and not mw.ustring.find(word, "[" .. acute .. grave .. "]") then
		if require("Module:string").count(word, "[аеєиіїоуюя]") > 1 then
			needsAccent = true
		end
	end

	
	-- Indexed 1..3 so that ipairs() processes three-letter sequences first,
	-- then two-letter sequences, then single letters. Within one level the
	-- keys do not overlap, so the pairs() order inside a level is irrelevant.
	local phonetic_chars_map = {
	
		-- single characters that map to IPA sounds; these are processed last
		[3] = {
			["а"] = "ɑ",	["б"] = "b",	["в"] = "ʋ",	["г"] = "ɦ",	["ґ"] = "ɡ", 
			["д"] = "d",	["е"] = "ɛ",	["є"] = "jɛ",	["ж"] = "ʒᵊ",	["з"] = "z", 
			["и"] = "ɪ",	["і"] = "i",	["ї"] = "ji",	["й"] = "j",	["к"] = "k", 
			["л"] = "l",	["м"] = "m",	["н"] = "n",	["о"] = "ɔ",	["п"] = "p", 
			["р"] = "r",	["с"] = "s",	["т"] = "t",	["у"] = "u",	["ф"] = "f", 
			["х"] = "x",	["ц"] = "t͡s",	["ч"] = "t͡ʃᵊ",	["ш"] = "ʃᵊ",	["щ"] = "ʃᵊt͡ʃᵊ", 
			["ь"] = "ʲ",	["ю"] = "ju",	["я"] = "jɑ",	["’"] = "j",
			-- accented vowels
			[acute] =  "ˈ", [grave] =  "ˈ",
		},
	
		-- character sequences of two that map to IPA sounds
		[2] = {
			["дж"] = "d͡ʒᵊ",	["дз"] = "d͡z",
		-- Dental plosives assimilate to following hissing/hushing consonants, which is not noted in the spelling.
			["дс"] = "d͡zs",   ["дш"] = "d͡ʒᵊʃᵊ",   ["дч"] = "d͡ʒᵊt͡ʃᵊ", ["дц"] = "d͡zt͡s",
			["тс"] = "t͡s",	["тш"] = "t͡ʃᵊʃᵊ",   ["тч"] = "t͡ʃᵊː", ["тц"] = "t͡sː", 
		},
	
		-- character sequences of three that map to IPA sounds
		[1] = {
			["дзь"] = "d͡zʲ", 
		-- Dental plosives assimilate to following hissing/hushing consonants, which is not noted in the spelling.
			["тьс"] = "t͡sʲː"
		},
	}
	
	local phonetic = word
	
	-- Ordered list of { pattern, replacement } pairs. The order is significant
	-- (anchored rules must run before their unanchored variants, and "стськ"
	-- before its prefix "стс"), so this must be a sequence walked with ipairs()
	-- rather than a hash table walked with pairs().
	local orthographic_replacements = {
		-- first apply consonant cluster simplifications that always occur orthographically
		{ "нтськ", "ньськ" },
		{ "стськ", "ськ" },
		{ "нтст", "нст" },
		{ "стч", "шч" },
		{ "стд", "зд" },
		{ "стс", "сː" },
		{ "^зш", "шː" },
		{ "зш", "жш" },
		{ "^зч", "шч" },
		{ "зч", "жч" },
	
		-- then long consonants that are orthographically geminated.
		{ "([бвгґд])%1", "%1ː" },
		{ "([^д]+)жж", "%1жː" }, -- джж sequence encode diphonemic дж
		{ "([^д]+)зз", "%1зː" }, -- дзз sequence encode diphonemic дз
		{ "([йклмнпрстфхцчшщ])%1", "%1ː" },
		{ "дждж", "джː" },
		{ "дздз", "дзː" },
	}
	
	for _, rule in ipairs(orthographic_replacements) do
		phonetic = gsub(phonetic, rule[1], rule[2])
	end
	
	-- remap apostrophe to '!' so that it doesn't conflict with IPA stress mark
	phonetic =  gsub(phonetic, "'", "!")
	
	-- replace multiple letter sequences
	for _, replacements in ipairs(phonetic_chars_map) do
		for key, replacement in pairs(replacements) do
			phonetic = gsub(phonetic, key, replacement)
		end
	end

	-- move stress mark, added by phonetic_chars_map, before vowel
	phonetic = gsub(phonetic, "([ɑɛiɪuɔ])ˈ", "ˈ%1")
	
	-- add accent if the word is monosyllabic and not "|accent=off";
	-- skip words that already carry a stress mark so it is not doubled
	local _, numberOfVowels  = gsub(phonetic, "[ɑɛiɪuɔ]", "")
	if (numberOfVowels == 1) and (accent ~= "off")
		and not mw.ustring.find(phonetic, "ˈ", 1, true) then
		phonetic = gsub(phonetic, "([ɑɛiɪuɔ])", "ˈ%1")
	end
	
	-- palatalizable consonants before /i/ or /j/ become palatalized
	local palatalizable = "[tdsznlrbpʋfɡmkɦxʃᵊʒᵊ]"
	phonetic = gsub(phonetic, "(" .. palatalizable .. ")([ː]?)([ˈ]?)i", "%1ʲ%2%3i")
	phonetic = gsub(phonetic, "(" .. palatalizable .. ")([ː]?)j", "%1ʲ%2")

	-- eliminate garbage sequences of [ʲːj] resulting from -тьс- cluster followed by [j]
	phonetic = gsub(phonetic, "ʲːj", "ʲː")

	-- consonant simplification: ст + ц' → [с'ц']. We do it here because of palatalization.
	-- Due to the т +ц → [ц:] rule length is present. According to Орфоепічний словник p. 13,
	-- both forms are proper, without length in normal (colloquial) speech and with length
	-- in slow speech, so we parenthesize the length as optional.
	phonetic =  gsub(phonetic, "st͡sʲ([ː]?)", "sʲt͡sʲ(%1)")
	
	-- assimilation: voiceless + voiced = voiced + voiced
	-- should /ʋ/ be included as voiced? Орфоепічний словник doesn't voice initial cluster of шв (p. 116)
	local voiced_obstruent = "[bdzʒᵊɡɦ]"
	-- Ordered list of { voiceless, voiced } pairs. Longer sequences come first:
	-- "s" is the tail of "t͡s" and "ʃᵊ" is the tail of "t͡ʃᵊ", so handling the
	-- fricative before the affricate would voice only half of the affricate.
	local voicing = {
		{ "ʃᵊt͡ʃᵊ", "ʒᵊd͡ʒᵊ" },
		{ "t͡sʲ", "d͡zʲ" },
		{ "t͡ʃᵊ", "d͡ʒᵊ" },
		{ "t͡s", "d͡z" },
		{ "tʲ", "dʲ" },
		{ "sʲ", "zʲ" },
		{ "ʃᵊ", "ʒᵊ" },
		{ "p", "b" },
		{ "f", "v" },
		{ "t", "d" },
		{ "s", "z" },
		{ "k", "ɡ" },
		{ "x", "ɣ" },
	}
	for _, pair in ipairs(voicing) do
		phonetic = gsub(phonetic, pair[1] .. "(" .. voiced_obstruent .. "+)", pair[2] .. "%1")
	end

	-- In the sequence of two consonants, of which the second is soft, the first is pronounced soft too
	-- unless the first consonant is a labial, namely б, п, в, ф, м.
	phonetic = gsub(phonetic, "([tdsznl])(.)ʲ", "%1ʲ%2ʲ")
	phonetic = gsub(phonetic, "([tdsznl])t͡sʲ", "%1ʲt͡sʲ")
	phonetic = gsub(phonetic, "([tdsznl])d͡zʲ", "%1ʲd͡zʲ")
	phonetic = gsub(phonetic, "t͡s(.)ʲ", "t͡sʲ%1ʲ")
	phonetic = gsub(phonetic, "d͡z(.)ʲ", "d͡zʲ%1ʲ")
	phonetic = gsub(phonetic, "d͡zt͡sʲ", "d͡zʲt͡sʲ")
	phonetic = gsub(phonetic, "t͡sd͡zʲ", "t͡sʲd͡zʲ")

	-- Hushing consonants ж, ч, ш assimilate to the following hissing consonants, giving a long hissing consonant:
	-- [ʒᵊ] + [t͡sʲ] → [zʲt͡sʲ], [t͡ʃᵊ] + [t͡sʲ] → [t͡sʲː], [ʃᵊ] + [t͡sʲ] → [sʲt͡sʲ], [ʃᵊ] + [sʲ] → [sʲː]
	phonetic = gsub(phonetic, "ʒᵊt͡sʲ", "zʲt͡sʲ")
	phonetic = gsub(phonetic, "t͡ʃᵊt͡sʲ", "t͡sʲː")
	phonetic = gsub(phonetic, "ʃᵊt͡sʲ", "sʲt͡sʲ")
	phonetic = gsub(phonetic, "ʃᵊsʲ", "sʲː")

	-- Hissing consonants before hushing consonants within a word assimilate - on зш and зч word-initially and 
	-- word-medially see above.
	-- [z] + [ʒᵊ] → [ʒᵊː], [s] + [ʃᵊ] → [ʃᵊː], [z] + [t͡s] → [ʒt͡s], [z] + [d͡ʒᵊ] → [ʒd͡ʒᵊ]
	-- NOTE(review): the third rule rewrites [z] before the hissing affricate [t͡s];
	-- given the heading, [t͡ʃᵊ] may have been intended - confirm against the source
	-- dictionary before changing. The rules also emit bare [ʒ] without the ᵊ
	-- used elsewhere in this module.
	phonetic = gsub(phonetic, "zʒᵊ", "ʒᵊː")
	phonetic = gsub(phonetic, "sʃᵊ", "ʃᵊː")
	phonetic = gsub(phonetic, "zt͡s", "ʒt͡s")
	phonetic = gsub(phonetic, "zd͡ʒᵊ", "ʒd͡ʒᵊ")
	
	-- cleanup: excessive palatalization: CʲCʲCʲ → CCʲCʲ
	phonetic = gsub(phonetic, "([^ɑɛiɪuɔ]+)ʲ([^ɑɛiɪuɔ]+)ʲ([^ɑɛiɪuɔ]+)ʲ", "%1%2ʲ%3ʲ")

	-- unstressed /ɑ/ has an allophone [ɐ]
	phonetic = gsub(phonetic, "([^ˈ])ɑ", "%1ɐ")
	phonetic = gsub(phonetic, "^ɑ", "ɐ")
	-- unstressed /u/ has an allophone [ʊ]
	phonetic = gsub(phonetic, "([^ˈ])u", "%1ʊ")
	phonetic = gsub(phonetic, "^u", "ʊ")
	-- unstressed /ɔ/ has by assimilation an allophone [o] before a stressed syllable with /u/ or /i/
	phonetic = gsub(phonetic, "ɔ([bdzʒᵊɡɦmnlrpftskxʲʃ͡ᵊ]+)ˈ([uiʊ]+)", "o%1ˈ%2")
	-- one allophone [e] covers unstressed /ɛ/ and /ɪ/
	phonetic = gsub(phonetic, "([^ˈ])ɛ", "%1e")
	phonetic = gsub(phonetic, "^ɛ", "e")
	phonetic = gsub(phonetic, "([^ˈ])ɪ", "%1e")
	phonetic = gsub(phonetic, "^ɪ", "e")   

	-- /ʋ/ has an allophone [u̯] in a syllable coda
	local vowel = "[ɑɛiɪuɔɐoʊe]"
	phonetic = gsub(phonetic, "(" .. vowel .. "+)ʋ", "%1u̯")
	-- /ʋ/ has an allophone [w] before /ɔ, u/ and voiced consonants (not after a vowel)
	phonetic = gsub(phonetic, "ʋ([ˈ]?)([ɔuoʊbdzʒᵊɡɦmnlr]+)", "w%1%2")
	-- /ʋ/ has an allophone [ʍ] before voiceless consonants (not after a vowel)
	phonetic = gsub(phonetic, "ʋ([pftskxʃᵊ]+)", "ʍ%1")

	-- in a syllable-final position (i.e. the first position of a syllable coda) /j/ has an allophone [i̯]:
	local consonant = "[bdzʒᵊɡɦmnlrpftskxʃᵊʋ]"
	phonetic = gsub(phonetic, "(" .. vowel .. "+)j([ˈ]?)(" .. gsub(consonant, "ʋ", "") .. "+)", "%1i̯%2%3")
	phonetic = gsub(phonetic, "(" .. vowel .. "+)j$", "%1i̯")
	-- also at the beginning of a word before a consonant
	phonetic = gsub(phonetic, "^j(" .. gsub(consonant, "ʋ", "") .. "+)", "i̯%1")
 
	-- remove old orthographic apostrophe
	phonetic =  gsub(phonetic, "!", "")
	-- stress mark in correct place: first move it in front of the whole
	-- consonant run, then push leading sonorants back before the mark
	phonetic = gsub(phonetic, "([bdzʒᵊɡɦjʲmnlrpftskxʃᵊʋwʍː͡]+)ˈ", "ˈ%1")
	phonetic = gsub(phonetic, "([ui]̯)ˈ([ʲ]?" .. vowel .. ")", "ˈ%1%2")
	phonetic = gsub(phonetic, "ˈ(l[ʲ]?[ː]?)(" .. gsub(consonant, "l", "") .. ")", "%1ˈ%2")
	phonetic = gsub(phonetic, "ˈ(r[ʲ]?[ː]?)(" .. gsub(consonant, "r", "") .. ")", "%1ˈ%2")
	phonetic = gsub(phonetic, "ˈ(m[ʲ]?[ː]?)([bpfɦszʃᵊʋʒᵊ])", "%1ˈ%2")
	phonetic = gsub(phonetic, "ˈ(n[ʲ]?[ː]?)([dtfkɡɦlxszʃᵊʋʒᵊ])", "%1ˈ%2")
	
	if output == "template" then
		return m_IPA.format_IPA_full(uk, { { pron = "[" .. phonetic .. "]" } } ) ..
			( needsAccent and
				'<span class="error previewonly"><br>The word ' .. word .. ' contains multiple vowels, but has no accent mark, so some of the vowels may be transcribed incorrectly. Please add acute accents to mark stressed syllables.</span>' ..
				-- pass the Ukrainian language object (the original passed an undefined global)
				require("Module:utilities").format_categories( { "Ukrainian terms with incomplete pronunciation" }, uk)
			or "" )
	else
		return (phonetic)
	end
end

-- Return the table of public functions as the module's value.
return export