User Tools

Site Tools


yivalkes:regex:pcre

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
yivalkes:regex:pcre [2026/06/13 15:59] – removed - external edit (Unknown date) A User Not Logged inyivalkes:regex:pcre [2026/06/13 15:59] (current) – ↷ Page moved and renamed from yivalkes:regex to yivalkes:regex:pcre wikarai
Line 1: Line 1:
 +====== REGEX SPECIFICATION ======
  
 +Complete rule set for direct implementation. All regexes use PCRE / Python re syntax. Apply re.IGNORECASE to every match and every substitution.
 +
 +===== Stem Functions =====
 +
 +<code python>
 +import re
 +
 +def causer_stem(spoken: str) -> str:
 +    """Derive the Causer stem from a spoken form.
 +
 +    Returns '' when ``spoken`` contains any of ``! ? ( )``. Otherwise one
 +    leading hyphen is removed and the text is lower-cased, then:
 +
 +    * if two or more hyphens remain, the first ``-segment-`` span together
 +      with everything after it is replaced by ``segment`` alone (the text
 +      before that span is kept);
 +    * otherwise everything from the first comma or hyphen onward is cut.
 +    """
 +    if re.search(r'[!?()]', spoken): return ''
 +    s = re.sub(r'^-', '', spoken).lower()
 +    if s.count('-') >= 2:
 +        return re.sub(r'-([^-]+)-.*$', r'\1', s)
 +    return re.sub(r'[,-].*$', '', s)
 +
 +def median_stem(spoken: str) -> str:
 +    """Derive the Median stem from a spoken form.
 +
 +    Returns '' when ``spoken`` contains any of ``! ? ( )``. Otherwise one
 +    leading hyphen is removed and the text is lower-cased, then the first
 +    applicable rule is used:
 +
 +    * two or more hyphens: the segment between the last two hyphens is
 +      dropped along with both hyphens (the groups are greedy, so group 1
 +      runs up to the second-to-last hyphen);
 +    * one hyphen: the hyphen is removed and the result is cut at the first
 +      space that follows it;
 +    * no hyphen: everything from the first comma onward is cut.
 +    """
 +    if re.search(r'[!?()]', spoken): return ''
 +    s = re.sub(r'^-', '', spoken).lower()
 +    m = re.match(r'(.*)-(.*)-(.*)', s)
 +    if m: return m.group(1) + m.group(3)
 +    m = re.match(r'(.*)-([^ ]*)(.*)', s)
 +    if m: return m.group(1) + m.group(2)
 +    return re.sub(r',.*$', '', s)
 +</code>
 +
 +===== Reduplicated: dedicated onset-echo table =====
 +
 +The Reduplicated column uses its own 16-pattern table. These are completely independent of the Actor/Passor PATTERNS list. Apply to the Causer stem: first match wins.
 +
 +<code python>
 +# (pattern, replacement) pairs, all anchored at the start of the stem.
 +# apply_reduplicated tries them top to bottom and uses the first pattern
 +# that matches, so the order of the entries is significant.
 +REDUP_PAIRS = [
 +    (r'^[td]h?([sz]h?)([aeiouy]*)([aeiou])',      r't\3d\1\2\3'),
 +    (r'^([bdgptkszfv])(h?)([lrsfzv])',             r'\1e\1\2\3'),
 +    (r'^([sz])(h?)([pbkgtd])',                     r's\2ez\3'),
 +    (r'^[fv]([pbkgtd])',                           r'fev\1'),
 +    (r'^[sz]([aeiou]*)([aeiou])',                  r's\2z\1\2'),
 +    (r'^[fv]([aeiou]*)([aeiou])',                  r'f\2v\1\2'),
 +    (r'^[sz]h([aeiou]*)([aeiou])',                 r'sh\2zh\1\2'),
 +    (r'^(h?)([uwo]*)([ou])',                       r'\1owo'),
 +    (r'^(h?)([iy]*)([aeoiu])',                     r'\1iya'),
 +    (r'^[pb]h?([aeiou]*)([aeiou])',                r'p\2b\1\2'),
 +    (r'^([nml])([aeiou]*)([aeiou])',               r'\1\3\1\2\3'),
 +    (r'^[kg]h?([aeiou]*)([aeiou])',                r'k\2g\1\2'),
 +    (r'^[fv]([nml])',                              r'fav\1'),
 +    (r'^[sz]([nml])',                              r'saz\1'),
 +    # NOTE(review): both groups below are optional, so this pattern matches
 +    # every string; the catch-all entry after it can never be selected.
 +    # Confirm whether that is intended.
 +    (r'^(h?)([ea]*)',                              r'\1ea'),
 +    (r'^',                                         r'hee'),   # catch-all
 +]
 +
 +def apply_reduplicated(causer: str) -> str:
 +    for pat, repl in REDUP_PAIRS:
 +        if re.search(pat, causer, re.IGNORECASE):
 +            return re.sub(pat, repl, causer, re.IGNORECASE)
 +    return causer
 +</code>
 +
 +===== 18 Patterns (Python regex strings) =====
 +
 +<code python>
 +# Eighteen end-anchored patterns, indexed [0]..[17] (labelled B..S in the
 +# trailing comments). The apply_* helpers visit them in the order given by
 +# an index list and use the first one that matches, so both the list order
 +# and the group numbering are significant: the replacement tables refer to
 +# these groups by backreference.
 +PATTERNS = [
 +    r'et(t?)$',                                           # [0] B
 +    r'([ou])y$',                                          # [1] C
 +    r"(y[ou]+)([sf]h?)?$",                                # [2] D
 +    r'([w]+)[ae]+$',                                      # [3] E
 +    r'([yi]+)([aeiou]+)([bdgptkmnlrfscvzjh]+)$',         # [4] F
 +    r'([wuo]+)([aeiou]*)([bdgptkmnrfscvzjh]+)$',         # [5] G
 +    r'([bdgptkmnlrfscvzjh]+)(([aeiou])([aeiou]))([lr])$', # [6] H
 +    r'([bdgptkmnlrfscvzjh]+)([aeiou])([lr])$',           # [7] I
 +    r"([bdgptkmnlrfscvzjh']+)([ea])([ea]*)$",            # [8] J
 +    r'([bdgptkmnlrfscvzjh]+)([aeiou]*)([ouw]+)$',        # [9] K
 +    r'([bdgptkmnlrfscvzjhw]+)([aeiou]*)([iy])$',         # [10] L
 +    r'([bdgptkmnlrfscvzjh]+)([aeiou]+)([bdgptkmnrfscvzjh]+)$',  # [11] M
 +    r'()([ae])y()$',                                      # [12] N
 +    r'(y)([ae]+)()$',                                     # [13] O
 +    # NOTE(review): the character class in [14]-[16] lists ' twice; one of
 +    # them may originally have been a typographic apostrophe that was lost
 +    # in transcription. Confirm against the source material.
 +    r"(^|[''\-])([aeoiu]+)([bpkgtd]+)$",                # [14] P
 +    r"([''\-])([aeoiu]+)([lr])$",                        # [15] Q
 +    r"([''\-])([aeoiu]+)([lr][lr])$",                    # [16] R
 +    r'$',                                                 # [17] S catch-all
 +]
 +
 +# Index lists that select which patterns are tried, and in what order.
 +SKIP_R_INDICES = list(range(16)) + [17]   # all except [16]
 +ALL_INDICES    = list(range(18))           # Actor-There only
 +</code>
 +
 +===== Replacement Strings per Case =====
 +
 +<code python>
 +# Actor-There (uses ALL_INDICES)
 +# Eighteen replacement strings; entry i is presumably the replacement for
 +# PATTERNS[i] (the apply_* helpers pair patterns[idx] with
 +# replacements[idx]), so backreferences refer to that pattern's groups.
 +REPL_ACTOR_THERE  = [
 +    r'et\1a', r'\1ya', r"\1'a\2", r'wawa', r'\1\2\3e', r'\1\2\3e',
 +    r'\1\2ra', r'\1\2ra', r'\1\2wa', r'\1\2wa', r'\1\2\3ya', r'\1\2\3e',
 +    r'\2ye', r"\1\2\3'a", r'\1\2\3a', r'\1\2ra', r'\1\2\3a', r'a'
 +]
 +# Actor-Hither (SKIP_R_INDICES)
 +REPL_ACTOR_HITHER = [
 +    r'et\1si', r'\1yi', r"\1'i", r'\1ey', r'\1\2\3i', r'\1\2\3i',
 +    r'\1\2li', r'\1\2li', r'\1\2yi', r'\1\2wii', r'\1\2\3yi', r'\1\2\3i',
 +    r'\2yi', r"\1\2\3'i", r'\1\2\3i', r'\1\2ri', r'\1\2\3i', r'i'
 +]
 +# Actor-Hence (SKIP_R_INDICES)
 +REPL_ACTOR_HENCE  = [
 +    r'et\1soy', r'\1iyo', r'\1yo\2', r'\1oy', r'\1\2\3oy', r'\1\2\3oy',
 +    r'\1\2loy', r'\1\2loy', r'\1\2yo', r'\1\2\3yo', r'\1\2\3yo', r'\1\2\3oy',
 +    r'\2iyo', r"\1\2\3'oy", r'\1\2\3oy', r'\1\2roy', r'\1\2\3oy', r'oy'
 +]
 +# Passor-Here (SKIP_R_INDICES)
 +REPL_PASSOR_HERE  = [
 +    r'et\1', r'uy', r'yu\2', r'wee', r'ii\3', r'u\3',
 +    r'\1\4\5', r'\1i\3', r'\1ee', r'\1u', r'\1\2i', r'\1i\3',
 +    r'ey', r'yee', r'\1e\3', r'\1el', r'\1el', r''
 +]
 +# Passor-There (SKIP_R_INDICES)
 +REPL_PASSOR_THERE = [
 +    r'ayt', r'aw', r'yaw\2', r'wea', r'ya\3', r'waw\3',
 +    r'\1\4ra', r'\1ea\3', r'\1ewa', r'\1ua', r'\1\2ay', r'\1ea\3',
 +    r'ay', r'yaw', r'\1ay\3', r'\1ear', r'\1ear', r'e'
 +]
 +# Passor-Hither (SKIP_R_INDICES)
 +REPL_PASSOR_HITHER = [
 +    r'iss', r'uiii', r'iyu\2', r'wi', r'yi\3i', r'wii\3',
 +    r'\1\4lii', r'\1elii', r'\1ey', r'\1uwi', r'\1eye', r'\1i\3i',
 +    r'eyi', r'yei', r'\1i\3i', r'\1iri', r'\1iri', r'i'
 +]
 +# Passor-Hence (SKIP_R_INDICES)
 +REPL_PASSOR_HENCE = [
 +    r'oss', r'uyu', r'uyu\2', r'wu', r'ya\3u', r'wi\3o',
 +    r'\1\5aw', r'\1\3aw', r'\1oy', r'\1oyo', r'\1yu', r'\1o\3u',
 +    r'eyu', r'yu', r'\1o\3u', r'\1oru', r'\1oru', r'u'
 +]
 +</code>
 +
 +===== Apply Functions =====
 +
 +<code python>
 +def apply_with_ai_at_7(q, ai, patterns, replacements, indices, flags=re.IGNORECASE):
 +    """Apply the first pattern (in ``indices`` order) that matches ``q``.
 +
 +    Falsy patterns are skipped. The substitution is performed on ``q``,
 +    except when the winning index is 7, in which case it is performed on
 +    ``ai`` instead. Returns ``q`` unchanged when no pattern matches.
 +
 +    NOTE(review): the match test always uses ``q``; if pattern 7 matches
 +    ``q`` but not ``ai``, ``ai`` is returned unmodified. Compare
 +    apply_passor_hence, which tests pattern 7 against ``ai``.
 +    """
 +    for idx in indices:
 +        if not patterns[idx]: continue
 +        if re.search(patterns[idx], q, flags):
 +            target = ai if idx == 7 else q
 +            return re.sub(patterns[idx], replacements[idx], target, flags=flags)
 +    return q
 +
 +def apply_passor_hence(q, ai, patterns, replacements, indices, flags=re.IGNORECASE):
 +    """Apply the first matching pattern, using ``ai`` for index 7 and ``q`` otherwise.
 +
 +    Indices are visited in the order given; falsy patterns are skipped.
 +    Pattern 7 is both tested and substituted against ``ai``; every other
 +    pattern is tested and substituted against ``q``. If pattern 7 does not
 +    match ``ai`` the loop simply moves on to the next index. Returns ``q``
 +    unchanged when nothing matches.
 +    """
 +    for idx in indices:
 +        if not patterns[idx]: continue
 +        if idx == 7:
 +            if re.search(patterns[7], ai, flags):
 +                return re.sub(patterns[7], replacements[7], ai, flags=flags)
 +        else:
 +            if re.search(patterns[idx], q, flags):
 +                return re.sub(patterns[idx], replacements[idx], q, flags=flags)
 +    return q
 +
 +def apply_case_ai(ai, patterns, replacements, indices, flags=re.IGNORECASE):
 +    """Apply to ``ai`` the first pattern (in ``indices`` order) that matches it.
 +
 +    Falsy patterns are skipped. Returns the substituted string, or ``ai``
 +    unchanged when no pattern matches.
 +    """
 +    for idx in indices:
 +        if not patterns[idx]: continue
 +        if re.search(patterns[idx], ai, flags):
 +            return re.sub(patterns[idx], replacements[idx], ai, flags=flags)
 +    return ai
 +
 +def apply_person(form, pairs, flags=re.IGNORECASE):
 +    """Apply the first (pattern, replacement) pair whose pattern matches ``form``.
 +
 +    ``pairs`` is an iterable of (pattern, replacement) tuples tried in
 +    order. Returns ``form`` unchanged when no pattern matches.
 +    """
 +    for pat, repl in pairs:
 +        if re.search(pat, form, flags):
 +            # count=1 limits the substitution to a single replacement. On
 +            # Python 3.7+ a pattern that can match empty at the end (such
 +            # as "e?$") would otherwise be replaced twice on a form ending
 +            # in "e": once for the "e" and once for the empty match after it.
 +            return re.sub(pat, repl, form, flags=flags, count=1)
 +    return form
 +</code>
 +
 +===== Person Suffix Pairs =====
 +
 +<code python>
 +# Ordered (pattern, replacement) lists in the shape apply_person iterates
 +# over: the first pair whose pattern matches is used, so each list ends
 +# with a pattern that always matches ("e?$" or "$").
 +ME_PAIRS   = [("([ao]|[ae]e)$", r'\1ni'), ("ii$", 'iin'), ("[uw]$", 'win'), ("e?$", 'in')]
 +YOU_PAIRS  = [("([^aeiouwyn])$", r'\1ets'),
 +              ("([^aeiou]*[aeiou]+[^aeiou]+[aeiou]+[^aeiou]*[aeiou]+)$", r'\1ts'),
 +              ("$", 'tse')]
 +THEM_PAIRS = [("([aeou])$", r'\1rh'), ("([^i][wyi])$", r'\1irh'), ("$", 'erh')]
 +</code>
 +
 +===== Portmanteau Table =====
 +
 +<code python>
 +# Maps (person, direction) to a (pattern, replacement, fallback) triple.
 +# apply_portmanteau substitutes `replacement` for `pattern` when the
 +# pattern matches, and otherwise appends `fallback` to the form.
 +PORTMANTEAUS = {
 +    ('me',   'there'):  ('in$',       'inia',   'ye'),
 +    ('you',  'there'):  ('tse$',      'tsa',    'a'),
 +    ('them', 'there'):  ('e?[ei]rh$', 'earh',   'a'),
 +    ('me',   'hither'): ('in$',       'inneye', 'yi'),
 +    ('you',  'hither'): ('tse$',      'tsi',    'i'),
 +    ('them', 'hither'): ('[ei]rh$',   'eyerh',  'i'),
 +    ('me',   'hence'):  ('in$',       'inyo',   'yo'),
 +    ('you',  'hence'):  ('tse$',      'tsoy',   'oy'),
 +    ('them', 'hence'):  ('[ei]rh$',   'iyorh',  'yo'),
 +}
 +
 +def apply_portmanteau(here_person_form, person, direction, flags=re.IGNORECASE):
 +    """Fuse a person-marked form with a direction using PORTMANTEAUS.
 +
 +    Looks up the (pattern, replacement, fallback) triple for
 +    ``(person, direction)``. If the pattern matches ``here_person_form``
 +    the match is replaced; otherwise the fallback suffix is appended.
 +
 +    Raises KeyError when ``(person, direction)`` is not a key of
 +    PORTMANTEAUS.
 +    """
 +    pat, sub, fallback = PORTMANTEAUS[(person, direction)]
 +    if re.search(pat, here_person_form, flags):
 +        return re.sub(pat, sub, here_person_form, flags=flags)
 +    return here_person_form + fallback
 +</code>
 +
 +===== Causative / Present Active / Cheers =====
 +
 +<code python>
 +# Ordered (pattern, replacement) lists; presumably applied with
 +# apply_person, i.e. the first matching pair wins.
 +# NOTE(review): in CAUSATIVE_ME_PAIRS the first pattern "([mn]*|ng)$" can
 +# match the empty string at the end of any form, so under first-match-wins
 +# the two later pairs are never reached. CAUSATIVE_YOU_PAIRS uses "m+" in
 +# the parallel position; "[mn]+" may have been intended. Confirm.
 +CAUSATIVE_ME_PAIRS   = [("([mn]*|ng)$", 'niya'), ("([aeiouwy])$", r'\1niya'), ("$", 'iniya')]
 +CAUSATIVE_YOU_PAIRS  = [("(m+|ng)$", 'ntaya'), ("([dt])+$", 'ttsaya'),
 +                         ("([aeiourlhzsyw])$", r'\1taya'), ("([pbkg])$", r'\1saya'), ("$", 'etaya')]
 +CAUSATIVE_THEM_PAIRS = [("([aeiouwyrh])r?$", r'\1rheya'), ("$", 'erheya')]
 +PRESENT_ACTIVE_PAIRS = [("oo$", 'waam'), ("[eoa]+$", 'aam'), ("([iu])$", r'\1yaam'), ("$", 'aam')]
 +# NOTE(review): "[aei]*$" also matches the empty string, so the final "$"
 +# pair in CHEERS_PAIRS is unreachable; both carry the same replacement.
 +CHEERS_PAIRS          = [("([aeou])[iy]$", r'\1iyets!'), ("([^aeiou])$", r'\1eyets!'),
 +                         ("[ou]+$", 'oyets!'), ("[aei]*$", 'eyets!'), ("$", 'eyets!')]
 +</code>
 +
 +===== Imperative Patterns and Replacements =====
 +
 +<code python>
 +# Seven start-anchored patterns; each list in IMP_REPLS has one entry per
 +# pattern, in the same order.
 +# NOTE(review): entries [2] and [4] are identical, so if these are applied
 +# first-match-wins like the other tables, [4] can never be selected.
 +# Likewise every string matched by the "[sz]h" alternative of [3] is
 +# already matched by [2]. Confirm the intended patterns.
 +IMP_PATTERNS = [
 +    r'^(h?([aeiouyw])|[pbvf][pb]?h?)',
 +    r'^[sz](h?)([aeiou])',
 +    r'^h?[aeiou]?(([sz])|[dt][td]?)h?',
 +    r'^h?[aeiou]?([gk][gk]?h?|[sz]h)',
 +    r'^h?[aeiou]?(([sz])|[dt][td]?)h?',
 +    r'^[m]([aeiou])',
 +    r'^h?',  # catch-all
 +]
 +
 +IMP_REPLS = {
 +    'mild':        [r'ipp\2',    r'itts\1\2',  r'itt\2',  'ikk',   r'itt\2',  r'ibb\1',  'ippe'   ],
 +    'regular':     [r'epp\2',    r'etts\1\2',  r'ett\2',  'ekk',   r'ett\2',  r'ebb\1',  'eppe'   ],
 +    'strong':      [r'app\2',    r'atts\1\2',  r'att\2',  'akk',   r'att\2',  r'abb\1',  'appe'   ],
 +    'silly_int':   [r'ayopp\2',  r'ayotts\1\2',r'ayott\2','ayokk', r'ayott\2',r'ayobb\1','ayoppe' ],
 +    'dism_int':    [r'eumb\2',    r'eundz\1\2', r'eund\2', 'eung',  r'eund\2', r'eumb\1', 'eumbe'  ],
 +    'mild_int':    [r'iyepp\2',  r'iyetts\1\2',r'iyyett\2','iyyekk',r'iyyett\2',r'iyyebb\1','iyyeppe'],
 +    'reg_int':     [r'eyapp\2',  r'eyatts\1\2',r'eyyatt\2','eyyakk',r'eyyatt\2',r'eyyabb\1','eyyappe'],
 +    'most_int':    [r'ayapp\2',  r'ayatts\1\2',r'ayyatt\2','ayyakk',r'ayyatt\2',r'ayyabb\1','ayyappe'],
 +    'silly_imp':   [r'opp\2',    r'otts\1\2',  r'ott\2',  'okk',   r'ott\2',  r'obb\1',  'oppe'   ],
 +    'dism_imp':    [r'mb\2',     r'ndz\1\2',   r'nd\2',   'ng',    r'nd\2',   r'mb\1',   'mbe'    ],
 +}
 +</code>