Module:okm-translit
Jump to navigation
Jump to search
- The following documentation is generated by Module:documentation/functions/translit. [edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
This module will transliterate Middle Korean language text. It is also used to transliterate Early Modern Korean.
The module should preferably not be called directly from templates or other modules.
To use it from a template, use {{xlit}}.
Within a module, use Module:languages#Language:transliterate.
For testcases, see Module:okm-translit/testcases.
Functions
tr(text, lang, sc)
- Transliterates a given piece of text written in the script specified by the code sc, and language specified by the code lang.
- When the transliteration fails, returns nil.
local export = {}
local gsub = mw.ustring.gsub
local chars_Hani = require('Module:scripts').getByCode('Hani'):getCharacters()
local chars_Hang = require('Module:scripts').getByCode('Hang'):getCharacters()
-- https://github.com/szc126/rime-slg-korean/blob/main/slg_break_jamo.yaml
-- https://github.com/szc126/rime-slg-korean/blob/main/soolegi_yethangeul.custom.yaml
-- Decomposition table: maps a single complex jamo (cluster, compound vowel or
-- compatibility jamo) to a sequence of simple jamo.
-- export.tr applies it with mw.ustring.gsub(text, '.', tt_complex) after NFD
-- normalization, so each character that is a key here is replaced by its
-- value and every other character is left as it is.
-- '@' in a value is a placeholder that the rule table `tt` later romanizes
-- as "y" (either directly or through the '@ᅮ' / '@ᅩ' rules).
local tt_complex = {
-- choseong (initial consonant) clusters
['ᄢ']='ᄇᄉᄀ',
['ᄣ']='ᄇᄉᄃ',
['ᄤ']='ᄇᄉᄇ',
['ᄥ']='ᄇᄉᄉ',
['ᄦ']='ᄇᄉᄌ',
['ᄳ']='ᄉᄇᄀ',
['ᄴ']='ᄉᄉᄉ',
['ꥥ']='ᄅᄀᄀ',
['ꥧ']='ᄅᄃᄃ',
['ꥪ']='ᄅᄇᄇ',
['ꥲ']='ᄇᄉᄐ',
['ꥵ']='ᄉᄉᄇ',
['ꥸ']='ᄌᄌᄒ',
['ᄁ']='ᄀᄀ',
['ᄄ']='ᄃᄃ',
['ᄈ']='ᄇᄇ',
['ᄊ']='ᄉᄉ',
['ᄍ']='ᄌᄌ',
['ᄓ']='ᄂᄀ',
['ᄔ']='ᄂᄂ',
['ᄕ']='ᄂᄃ',
['ᄖ']='ᄂᄇ',
['ᄗ']='ᄃᄀ',
['ᄘ']='ᄅᄂ',
['ᄙ']='ᄅᄅ',
['ᄚ']='ᄅᄒ',
['ᄜ']='ᄆᄇ',
['ᄞ']='ᄇᄀ',
['ᄟ']='ᄇᄂ',
['ᄠ']='ᄇᄃ',
['ᄡ']='ᄇᄉ',
['ᄧ']='ᄇᄌ',
['ᄨ']='ᄇᄎ',
['ᄩ']='ᄇᄐ',
['ᄪ']='ᄇᄑ',
['ᄬ']='ᄫᄫ',
['ᄭ']='ᄉᄀ',
['ᄮ']='ᄉᄂ',
['ᄯ']='ᄉᄃ',
['ᄰ']='ᄉᄅ',
['ᄱ']='ᄉᄆ',
['ᄲ']='ᄉᄇ',
['ᄵ']='ᄉᄋ',
['ᄶ']='ᄉᄌ',
['ᄷ']='ᄉᄎ',
['ᄸ']='ᄉᄏ',
['ᄹ']='ᄉᄐ',
['ᄺ']='ᄉᄑ',
['ᄻ']='ᄉᄒ',
['ᄽ']='ᄼᄼ',
['ᄿ']='ᄾᄾ',
['ᅁ']='ᄋᄀ',
['ᅂ']='ᄋᄃ',
['ᅃ']='ᄋᄆ',
['ᅄ']='ᄋᄇ',
['ᅅ']='ᄋᄉ',
['ᅆ']='ᄋᅀ',
['ᅇ']='ᄋᄋ',
['ᅈ']='ᄋᄌ',
['ᅉ']='ᄋᄎ',
['ᅊ']='ᄋᄐ',
['ᅋ']='ᄋᄑ',
['ᅍ']='ᄌᄋ',
['ᅏ']='ᅎᅎ',
['ᅑ']='ᅐᅐ',
['ᅒ']='ᄎᄏ',
['ᅓ']='ᄎᄒ',
['ᅖ']='ᄑᄇ',
['ᅘ']='ᄒᄒ',
['ᅚ']='ᄀᄃ',
['ᅛ']='ᄂᄉ',
['ᅜ']='ᄂᄌ',
['ᅝ']='ᄂᄒ',
['ᅞ']='ᄃᄅ',
['ꥠ']='ᄃᄆ',
['ꥡ']='ᄃᄇ',
['ꥢ']='ᄃᄉ',
['ꥣ']='ᄃᄌ',
['ꥤ']='ᄅᄀ',
['ꥦ']='ᄅᄃ',
['ꥨ']='ᄅᄆ',
['ꥩ']='ᄅᄇ',
['ꥫ']='ᄅᄫ',
['ꥬ']='ᄅᄉ',
['ꥭ']='ᄅᄌ',
['ꥮ']='ᄅᄏ',
['ꥯ']='ᄆᄀ',
['ꥰ']='ᄆᄃ',
['ꥱ']='ᄆᄉ',
['ꥳ']='ᄇᄏ',
['ꥴ']='ᄇᄒ',
['ꥶ']='ᄋᄅ',
['ꥷ']='ᄋᄒ',
['ꥹ']='ᄐᄐ',
['ꥺ']='ᄑᄒ',
['ꥻ']='ᄒᄉ',
['ꥼ']='ᅙᅙ',
-- jungseong (vowel) compounds; '@' is the y placeholder
['ᆅ']='@ᅩ@ᅡ@',
['ᆒ']='@ᅮ@ᅥ@',
['ᅹ']='@ᅡ@ᅩ',
['ᆄ']='@ᅩ@ᅡ',
['ᆆ']='@ᅩ@ᅥ',
['ᆑ']='@ᅮ@ᅥ',
['ᆥ']='@ᅥ@ᅡ',
['ᆐ']='@ᅮᅥ@',
['ힳ']='@ᅩᅡ@',
['ힷ']='@ᅮᅡ@',
['ᆁ']='ᅩ@ᅥ@',
['ᆌ']='ᅮ@ᅥ@',
['ᆧ']='ᅩ@ᅡ@',
['ힽ']='ᅵ@ᅡᅩ',
['ힾ']='ᅵ@ᅡ@',
['ퟀ']='ᅵ@ᅥ@',
['ᅤ']='@ᅡ@',
['ᅨ']='@ᅥ@',
['ᅸ']='@ᅡᅩ',
['ᅽ']='@ᅥᅩ',
['ᅾ']='@ᅥᅮ',
['ᆇ']='@ᅩᅩ',
['ᆈ']='@ᅩ@',
['ᆎ']='@ᅮᅡ',
['ᆏ']='@ᅮᅥ',
['ᆓ']='@ᅮᅮ',
['ᆔ']='@ᅮ@',
['ᆤ']='@ᅡᅮ',
['ힲ']='@ᅩᅡ',
['ힴ']='@ᅩᅥ',
['ힸ']='@ᅮᅩ',
['ᆙ']='ᅵ@ᅡ',
['ᆦ']='ᅩ@ᅡ',
['ힰ']='ᅩ@ᅥ',
['ힵ']='ᅮ@ᅥ',
['ힿ']='ᅵ@ᅥ',
['ퟂ']='ᅵ@ᅩ',
['ퟃ']='ᅵ@ᅮ',
['ᅫ']='ᅩᅡ@',
['ᅰ']='ᅮᅥ@',
['ᆀ']='ᅩᅥ@',
['ᆊ']='ᅮᅡ@',
['ᆋ']='ᅮᅥᅳ',
['ᆗ']='ᅳᅵᅮ',
['ힱ']='ᅩᅩᅵ',
['ힶ']='ᅮᅵ@',
['ힻ']='ᅳᅥ@',
['ퟁ']='ᅵᅩᅵ',
['ퟆ']='ᆞᅥ@',
['ᅣ']='@ᅡ',
['ᅧ']='@ᅥ',
['ᅭ']='@ᅩ',
['ᅲ']='@ᅮ',
['ᅢ']='ᅡ@',
['ᅦ']='ᅥ@',
['ᅪ']='ᅩᅡ',
['ᅬ']='ᅩ@',
['ᅯ']='ᅮᅥ',
['ᅱ']='ᅮ@',
['ᅴ']='ᅳ@',
['ᅶ']='ᅡᅩ',
['ᅷ']='ᅡᅮ',
['ᅺ']='ᅥᅩ',
['ᅻ']='ᅥᅮ',
['ᅼ']='ᅥᅳ',
['ᅿ']='ᅩᅥ',
['ᆂ']='ᅩᅩ',
['ᆃ']='ᅩᅮ',
['ᆉ']='ᅮᅡ',
['ᆍ']='ᅮᅮ',
['ᆕ']='ᅳᅮ',
['ᆖ']='ᅳᅳ',
['ᆘ']='ᅵᅡ',
['ᆚ']='ᅵᅩ',
['ᆛ']='ᅵᅮ',
['ᆜ']='ᅵᅳ',
['ᆝ']='ᅵᆞ',
['ᆟ']='ᆞᅥ',
['ᆠ']='ᆞᅮ',
['ᆡ']='ᆞ@',
['ᆢ']='ᆞᆞ',
['ᆣ']='ᅡᅳ',
['ힹ']='ᅳᅡ',
['ힺ']='ᅳᅥ',
['ힼ']='ᅳᅩ',
['ퟄ']='ᅵ@',
['ퟅ']='ᆞᅡ',
-- jongseong (final consonant) clusters
['ᇄ']='ᆨᆺᆨ',
['ᇌ']='ᆯᆨᆺ',
['ᇏ']='ᆯᆮᇂ',
['ᇑ']='ᆯᆷᆨ',
['ᇒ']='ᆯᆷᆺ',
['ᇓ']='ᆯᆸᆺ',
['ᇔ']='ᆯᆸᇂ',
['ᇖ']='ᆯᆺᆺ',
['ᇞ']='ᆷᆺᆺ',
['ᇭ']='ᇰᆨᆨ',
['ퟎ']='ᆮᆮᆸ',
['ퟑ']='ᆮᆺᆨ',
['ퟕ']='ᆯᆨᆨ',
['ퟖ']='ᆯᆨᇂ',
['ퟗ']='ᆯᆯᆿ',
['ퟘ']='ᆯᆷᇂ',
['ퟙ']='ᆯᆸᆮ',
['ퟚ']='ᆯᆸᇁ',
['ퟜ']='ᆯᇹᇂ',
['ퟟ']='ᆷᆫᆫ',
['ퟡ']='ᆷᆸᆺ',
['ퟤ']='ᆸᆯᇁ',
['ퟧ']='ᆸᆺᆮ',
['ퟬ']='ᆺᆺᆨ',
['ퟭ']='ᆺᆺᆮ',
['ퟸ']='ᆽᆸᆸ',
['ᆩ']='ᆨᆨ',
['ᆪ']='ᆨᆺ',
['ᆬ']='ᆫᆽ',
['ᆭ']='ᆫᇂ',
['ᆰ']='ᆯᆨ',
['ᆱ']='ᆯᆷ',
['ᆲ']='ᆯᆸ',
['ᆳ']='ᆯᆺ',
['ᆴ']='ᆯᇀ',
['ᆵ']='ᆯᇁ',
['ᆶ']='ᆯᇂ',
['ᆹ']='ᆸᆺ',
['ᆻ']='ᆺᆺ',
['ᇃ']='ᆨᆯ',
['ᇅ']='ᆫᆨ',
['ᇆ']='ᆫᆮ',
['ᇇ']='ᆫᆺ',
['ᇈ']='ᆫᇫ',
['ᇉ']='ᆫᇀ',
['ᇊ']='ᆮᆨ',
['ᇋ']='ᆮᆯ',
['ᇍ']='ᆯᆫ',
['ᇎ']='ᆯᆮ',
['ᇐ']='ᆯᆯ',
['ᇕ']='ᆯᇦ',
['ᇗ']='ᆯᇫ',
['ᇘ']='ᆯᆿ',
['ᇙ']='ᆯᇹ',
['ᇚ']='ᆷᆨ',
['ᇛ']='ᆷᆯ',
['ᇜ']='ᆷᆸ',
['ᇝ']='ᆷᆺ',
['ᇟ']='ᆷᇫ',
['ᇠ']='ᆷᆾ',
['ᇡ']='ᆷᇂ',
['ᇣ']='ᆸᆯ',
['ᇤ']='ᆸᇁ',
['ᇥ']='ᆸᇂ',
['ᇧ']='ᆺᆨ',
['ᇨ']='ᆺᆮ',
['ᇩ']='ᆺᆯ',
['ᇪ']='ᆺᆸ',
['ᇬ']='ᇰᆨ',
['ᇮ']='ᇰᇰ',
['ᇯ']='ᇰᆿ',
['ᇱ']='ᇰᆺ',
['ᇲ']='ᇰᇫ',
['ᇳ']='ᇁᆸ',
['ᇵ']='ᇂᆫ',
['ᇶ']='ᇂᆯ',
['ᇷ']='ᇂᆷ',
['ᇸ']='ᇂᆸ',
['ᇺ']='ᆨᆫ',
['ᇻ']='ᆨᆸ',
['ᇼ']='ᆨᆾ',
['ᇽ']='ᆨᆿ',
['ᇾ']='ᆨᇂ',
['ᇿ']='ᆫᆫ',
['ퟋ']='ᆫᆯ',
['ퟌ']='ᆫᆾ',
['ퟍ']='ᆮᆮ',
['ퟏ']='ᆮᆸ',
['ퟐ']='ᆮᆺ',
['ퟒ']='ᆮᆽ',
['ퟓ']='ᆮᆾ',
['ퟔ']='ᆮᇀ',
['ퟛ']='ᆯᇰ',
['ퟞ']='ᆷᆫ',
['ퟠ']='ᆷᆷ',
['ퟢ']='ᆷᆽ',
['ퟣ']='ᆸᆮ',
['ퟥ']='ᆸᆷ',
['ퟦ']='ᆸᆸ',
['ퟨ']='ᆸᆽ',
['ퟩ']='ᆸᆾ',
['ퟪ']='ᆺᆷ',
['ퟫ']='ᆺᇦ',
['ퟮ']='ᆺᇫ',
['ퟯ']='ᆺᆽ',
['ퟰ']='ᆺᆾ',
['ퟱ']='ᆺᇀ',
['ퟲ']='ᆺᇂ',
['ퟳ']='ᇫᆸ',
['ퟴ']='ᇫᇦ',
['ퟵ']='ᇰᆷ',
['ퟶ']='ᇰᇂ',
['ퟷ']='ᆽᆸ',
['ퟹ']='ᆽᆽ',
['ퟺ']='ᇁᆺ',
['ퟻ']='ᇁᇀ',
-- compatibility jamo: consonants are mapped to choseong (initial) forms
-- NOTE(review): the compatibility vowels below decompose with 'ᅵ' (e.g.
-- 'ㅐ' -> 'ᅡᅵ') where the conjoining-jamo entries above use '@' (e.g.
-- 'ᅢ' -> 'ᅡ@'); confirm that the difference is intended.
['ㅩ']='ᄅᄀᄉ',
['ㅫ']='ᄅᄇᄉ',
['ㅴ']='ᄇᄉᄀ',
['ㅵ']='ᄇᄉᄃ',
['ㄲ']='ᄀᄀ',
['ㄸ']='ᄃᄃ',
['ㅃ']='ᄇᄇ',
['ㄳ']='ᄀᄉ',
['ㄵ']='ᄂᄌ',
['ㄶ']='ᄂᄒ',
['ㄺ']='ᄅᄀ',
['ㄻ']='ᄅᄆ',
['ㄼ']='ᄅᄇ',
['ㄽ']='ᄅᄉ',
['ㄾ']='ᄅᄐ',
['ㄿ']='ᄅᄑ',
['ㅀ']='ᄅᄒ',
['ㅄ']='ᄇᄉ',
['ㅆ']='ᄉᄉ',
['ㅉ']='ᄌᄌ',
['ㅥ']='ᄂᄂ',
['ㅦ']='ᄂᄃ',
['ㅧ']='ᄂᄉ',
['ㅨ']='ᄂᅀ',
['ㅪ']='ᄅᄃ',
['ㅬ']='ᄅᅀ',
['ㅭ']='ᄅᅙ',
['ㅮ']='ᄆᄇ',
['ㅯ']='ᄆᄉ',
['ㅰ']='ᄆᅀ',
['ㅲ']='ᄇᄀ',
['ㅳ']='ᄇᄃ',
['ㅶ']='ᄇᄌ',
['ㅷ']='ᄇᄐ',
['ㅹ']='ᄫᄫ',
['ㅺ']='ᄉᄀ',
['ㅻ']='ᄉᄂ',
['ㅼ']='ᄉᄃ',
['ㅽ']='ᄉᄇ',
['ㅾ']='ᄉᄌ',
['ㆀ']='ᄋᄋ',
['ㆂ']='ᅌᄉ',
['ㆃ']='ᅌᅀ',
['ㆅ']='ᄒᄒ',
['ㄱ']='ᄀ',
['ㄴ']='ᄂ',
['ㄷ']='ᄃ',
['ㄹ']='ᄅ',
['ㅁ']='ᄆ',
['ㅂ']='ᄇ',
['ㅅ']='ᄉ',
['ㅇ']='ᄋ',
['ㅈ']='ᄌ',
['ㅊ']='ᄎ',
['ㅋ']='ᄏ',
['ㅌ']='ᄐ',
['ㅍ']='ᄑ',
['ㅎ']='ᄒ',
['ㅤ']='ᅟ', -- filler
['ㅱ']='ᄝ',
['ㅸ']='ᄫ',
['ㅿ']='ᅀ',
['ㆁ']='ᅌ',
['ㆄ']='ᅗ',
['ㆆ']='ᅙ',
['ㆈ']='@ᅩ@ᅡᅵ',
['ㆋ']='@ᅮ@ᅥᅵ',
['ㆇ']='@ᅩ@ᅡ',
['ㆊ']='@ᅮ@ᅥ',
['ㅒ']='@ᅡᅵ',
['ㅖ']='@ᅥᅵ',
['ㅙ']='ᅩᅡᅵ',
['ㅞ']='ᅮᅥᅵ',
['ㆉ']='@ᅩᅵ',
['ㆌ']='@ᅮᅵ',
['ㅐ']='ᅡᅵ',
['ㅑ']='@ᅡ',
['ㅔ']='ᅥᅵ',
['ㅕ']='@ᅥ',
['ㅘ']='ᅩᅡ',
['ㅚ']='ᅩᅵ',
['ㅛ']='@ᅩ',
['ㅝ']='ᅮᅥ',
['ㅟ']='ᅮᅵ',
['ㅠ']='@ᅮ',
['ㅢ']='ᅳᅵ',
['ㅏ']='ᅡ',
['ㅓ']='ᅥ',
['ㅗ']='ᅩ',
['ㅜ']='ᅮ',
['ㅡ']='ᅳ',
['ㅣ']='ᅵ',
['ㆍ']='ᆞ',
}
-- Ordered rule table driving the romanization in export.tr.
-- Format, one rule per line:   <pattern> <separator> <replacement>
--   * export.tr splits each line with the pattern '(.+)\t(.+)', i.e. on a tab.
--     NOTE(review): verify that the field separators inside this literal are
--     real tab characters; a space-separated line does not match that pattern.
--   * <pattern> is a Lua (mw.ustring) pattern and <replacement> a gsub
--     replacement string; rules are applied to the whole text in file order.
--   * A replacement of '×' stands for the empty string.
--   * '◆' marks jamo without a romanization; export.tr tracks any output that
--     still contains it.
--   * 'BREAK 1' and 'BREAK 2' are not rules: they mark the points at which
--     export.tr runs its hard-coded processing steps.
--   * '#' starts a comment; comments and empty lines are stripped right after
--     this literal, before the table is used.
-- NOTE(review): the hanja character class on the first rule ends in a dangling
-- '-', which a Lua pattern treats as a literal hyphen; confirm that no range
-- end point was lost there.
local tt = [==[
BREAK 1
# remove hanja from (ex.) 사뎐(辭典)
# caps prob. isn't necessary since the "base" text is actually hangeul?
# Hani regex is a reasonable subset of Hani from [[Module:scripts/data]],
# last checked on 20220221
%([一-鿿㐀-䶿𠀀-𰀀-]+%) ×
# to yale
# non-simple
gᄋ Ğ # voiced velar fricative /ɣ/
ᄋᄋ Ő
@ᅮ yu
@ᅩ yo
ᅩᅡ wa
ᅮᅥ we
ᅵᆞ yo
ᆞᆞ yo
# choseong
ᄀ K
ᄂ N
ᄃ T
ᄅ L
ᄆ M
ᄇ P
ᄉ S
ᄋ Ø
ᄌ C
ᄎ CH
ᄏ KH
ᄐ TH
ᄑ PH
ᄒ H
ᄝ ◆
ᄫ Ƃ
ᅗ ◆
ᄛ ◆
ᅌ Ŋ
ᅀ Z
ᅙ Q
ᄼ ◆
ᅎ ◆
ᅔ ◆
ᄾ ◆
ᅐ ◆
ᅕ ◆
ᅟ × # filler
# jungseong
@ y
ᅡ a
ᅥ e
ᅩ wo
ᅮ wu
ᅳ u
ᅵ i
ᆞ o
ᅠ × # filler
# jongseong
ᆨ k
ᆫ n
ᆮ t
ᆯ l
ᆷ m
ᆸ p
ᆺ s
ᆼ ø
ᆽ c
ᆾ ch
ᆿ kh
ᇀ th
ᇁ ph
ᇂ h
ᇢ ◆
ᇦ ƃ
ᇴ ◆
ퟝ ◆
ᇰ ŋ
ᇫ z
ᇹ q
# tone
〮 ↑
〯 →
# tone diacritic location
([aiueo]+)([y]?)([↑→↓]) %1%3%2
# hyphens within syllables
# CV-y
# CVC-C
# CV-C
# C-V
%-%-%-%-(.-[wyaiueo↑→↓]+)(y) %1-%2
%-%-%-(.-[wyaiueo↑→↓]+[^wyaiueo ])([^wyaiueo ]) %1-%2
%-%-%-(.-[wyaiueo↑→↓]+) %1-
%-%-(.-)([wyaiueo]) %1-%2
# 子(ᄌᆞ)ㅣ
(%))(%-?)i %1%2y
Ø ×
BREAK 2
↑ ́
→ ̌
↓ ̀
ğ G
ő OO
Ø NG # capitalized hanja readings
ø ng
ƃ W
Ŋ NG # capitalized hanja readings
ŋ ng
]==]
-- Normalise the rule table once at load time: trim it, drop every '#' comment
-- together with the whitespace in front of it, then collapse the runs of
-- newlines this leaves behind so that no empty line remains.
tt = gsub(mw.text.trim(tt), '%s*#[^\n]+', '')
tt = gsub(tt, '\n+', '\n')

-- Character lists used by export.tr to build character classes. They repeat
-- the left-hand sides of the matching sections of the rule table.
-- choseong (initial consonants), ending with the choseong filler
local a = 'ᄀᄂᄃᄅᄆᄇᄉᄋᄌᄎᄏᄐᄑᄒᄝᄫᅗᄛᅌᅀᅙᄼᅎᅔᄾᅐᅕᅟ'
-- jungseong (vowels) plus the '@' placeholder, ending with the jungseong filler
local b = '@ᅡᅥᅩᅮᅳᅵᆞᅠ'
-- jongseong (final consonants)
local c = 'ᆨᆫᆮᆯᆷᆸᆺᆼᆽᆾᆿᇀᇁᇂᇢᇦᇴퟝᇰᇫᇹ'
-- tone marks
local d = '〮〯'
-- Splits one line of the rule table `tt` into pattern and replacement.
-- Fields are expected to be tab-separated. As a fallback, the last run of
-- whitespace on the line is taken as the separator, so a table whose tabs
-- were turned into spaces still loads instead of failing on a nil value.
-- Returns nil when the line does not contain two fields.
local function parse_rule(line)
	local pattern, repl = mw.ustring.match(line, '(.+)\t(.+)')
	if not pattern then
		pattern, repl = mw.ustring.match(line, '^(%S.-)%s+(%S+)$')
	end
	return pattern, repl
end

--- Transliterates Hangul text by running it through `tt_complex` and the
-- ordered rule table `tt`.
-- @param text  the string to transliterate
-- @param lang  language code; not used by this implementation
-- @param sc    script code; not used by this implementation
-- @return the romanized string, or nil when `text` contains no Hangul character
function export.tr(text, lang, sc)
	-- Drop ruby markup tags (<ruby>, <rp>, <rt> and their closing forms);
	-- the text between the tags is kept.
	text = gsub(text, "%<%/?r[pt]%>", "")
	text = gsub(text, "%<%/?ruby%>", "")

	-- Nothing to romanize without at least one Hangul character.
	if not mw.ustring.match(text, '[' .. chars_Hang .. ']') then
		return nil
	end

	-- Truthy when the input carries at least one of the tone marks in `d`.
	local bool_tone_marking = mw.ustring.find(text, ('[%s]'):format(d))

	-- Decompose syllable blocks into jamo, then break complex jamo into
	-- sequences of simple ones.
	text = mw.ustring.toNFD(text)
	text = mw.ustring.gsub(text, '.', tt_complex)

	for line in mw.text.gsplit(tt, '\n') do
		local pattern, repl = parse_rule(line)
		if not pattern then
			-- Blank lines are harmless; anything else is a broken rule and is
			-- reported by name rather than as a nil concatenation.
			if mw.ustring.find(line, '%S') then
				error(('okm-translit: malformed rule line %q'):format(line))
			end
		elseif pattern == 'BREAK' and repl == '1' then
			-- add period between hanja readings: every run of choseong inside
			-- a parenthesised reading that follows a hanja gets a leading '.'
			text = mw.ustring.gsub(text, '([' .. chars_Hani .. '])%((.-)%)', function(hanja, reading)
				local dotted = mw.ustring.gsub(reading, ('([%s]+)'):format(a), '.%1')
				return hanja .. '(' .. dotted .. ')'
			end)
			if bool_tone_marking then
				-- move the location of tone marks for easier handling
				-- (initial, vowel, tone, final) and mark low tone: a syllable
				-- without a tone mark receives '↓'
				text = mw.ustring.gsub(text, ('([%s]+)([%s]+)([%s]*)([%s]*)'):format(a, b, c, d), function(cho, jung, jong, tone)
					return cho .. jung .. (tone == '' and '↓' or tone) .. jong
				end)
			end
		elseif pattern == 'BREAK' and repl == '2' then
			text = mw.ustring.lower(text)
			-- hanja readings
			-- ref. [[Module:Ethi-translit]]
			-- Replace "hanja(reading)" by the reading alone, in capitals.
			text = mw.ustring.gsub(text, '[' .. chars_Hani .. ']+%((.-)%)', function(reading)
				-- treat final ieung as null if tones are marked (is this a safe assumption?)
				if bool_tone_marking then
					reading = mw.ustring.gsub(reading, 'ø', '')
				end
				-- convert to uppercase
				return (mw.ustring.upper(reading))
			end)
			-- remove hanja reading leading period
			text = mw.ustring.gsub(text, '^%.', '')
			text = mw.ustring.gsub(text, "'''%.", "'''")
			text = mw.ustring.gsub(text, '(%s)%.', '%1')
		else
			-- ordinary rule; '×' stands for the empty replacement
			if repl == '×' then
				repl = ''
			end
			text = mw.ustring.gsub(text, pattern, repl)
		end
	end

	-- track failed romanizations
	-- (black diamond instead of U+FFFD to avoid warnings when saving this page)
	if mw.ustring.match(text, '◆') then
		require('Module:debug').track('okm-translit/failed romanization')
	end

	return text
end
return export