#!/bin/bash
# -*- mode: sh; coding: utf-8 -*-
# $Date: 2008/03/01 06:26:19 $
#
# Filter: reads an ASCII-based phonetic transcription on stdin (the notes
# below mention Mordvin and Komi material), writes the same text recoded
# with Latin letters carrying diacritics on stdout, and then checks that no
# unconverted markup is left over.
#
# Exit status: the exit status of sed. In particular:
#   56 - a '^' survived the conversion;
#   57 - a capital letter that is not preceded by a backslash survived.
# In both error cases the offending line (in the form the check saw it) is
# copied to stderr, followed by a one-line error message.
#
# Requires GNU sed ('q' with an exit code, 'w /dev/stderr').

#######################################
# Run the recoding sed program as a stdin-to-stdout filter.
# Inputs:  transcription text on stdin
# Outputs: recoded text on stdout; on a well-formedness error the
#          offending line on stderr
# Returns: the exit status of sed (56 or 57 on the two errors above)
#
# NOTE(review): 'POSIX.UTF8' is presumably a locale name of the original
#   author's system; confirm that it exists where this script is deployed.
# NOTE(review): in a UTF-8 locale [[:upper:]] presumably also matches
#   non-Latin capitals (e.g. Cyrillic), although the error message speaks
#   of Latin ones only -- confirm which is intended.
# NOTE(review): the program is inside shell double quotes, so the '\\url'
#   below reaches sed as '\url'; confirm that sed matches the backslash
#   literally there.
#######################################
recode_transcription() {
  # Do the matching for Latin alphabet (English), but in the UTF-8 encoding (for wide characters).
  LC_ALL=POSIX.UTF8 sed -e "# I use the doublequote because I use ' in the scripts.
# NB: take care when writing this map that it should basically be injective
# (except for unimportant variants like s^' and s'^): if it's not, it means
# there is a mess in your transcription system, and you should fix your transcripts.
# FIXME: Terrible bug: now, '\Fut' gives '\wˣut'. This is not elegantly
# avoidable in this sed script, so I'll rewrite the program in another language.
#############################################
# Very special combinations of symbols
# (these transformations must come in the script before the other
# transformations because the others are more general
# and would match these special cases as well.)
# Sometimes I wrote the phonetic variants of sʹ, zʹ:
# (FIXME: when should I use sʹ and when ś in the recoded script?)
# (s^' can also represent Russian s^', as in 'es^'o'.)
s|s'^|ś|g
s|s^'|ś|g
s|z'^|ź|g
s|z^'|ź|g
# possibly I could write the phonetic variant of t':
s|c^'|ć|g # soft, of course
s|c'^|ć|g
# FIXME: How did I write an analogous variant of d'? E.g., the word for \`Теньгушево' after a voiced? Or is it hard?
################################################
# Fricatives:
s|s^|š|g # fricative
s|c^|č|g # affricate, pronounced hard
s|z^|ž|g # fricative
s|Z^|ǯ|g # affricate
# At last, the simplest combinations:
#s|Z|ʒ|g # affricate -- Commented out, because it seems this sound did not occur in Mordvin or Komi speech, but I wrote it by mistake. So this conversion will catch the errors.
# (FIXME: What about affricates with the palatal sibilants: ź, ś? Z^'; and c^' is already present.)
###############################################
# Vowels:
s|@|ə|g # schwa (unclear vowel that can be pronounced by them with any quality)
# FIXME: (Komi) try to put the Cyrillic-o-umlaut in the Cyrillic orthography; example:
# казьт воспоминание казьт@д
#
###############################################
# The voiceless:
s|R|rˣ|g # the voiceless r
s|L|lˣ|g # the voiceless l
#s|F|wˣ|g # the voiceless w (sounds also like x, h)
# -- disabled 'F' because of the '\Fut' bug. Actually,
# 'W' for this sound looks more systematic in my system, so perhaps
# I should have used it, shouldn't I?
s|W|wˣ(?)|g # FIXME: sometimes I wrote 'W', is it the same sound?
s|J|jˣ|g # the voiceless j (sounds also like x, h; soft? Perhaps, I didn't distinguish 2 sounds--soft and hard--when using this sign.)
##############################################
# N:
s|N|ŋ|g # velar or nasalization
h # Save before the destructive checks (copy pattern space to hold space).
##########################################
# Well-formedness of the input (is checked destructively):
# - no ^'s should remain;
# - other capitals should not have been used (no S^, no C^ and no plain capitals);
# - more?
s|\$Date.*\$|| # ignore the 'preamble'
s|\\url{.*}|| # ignore filenames, which have been marked up like this: \url{Komi_rus_slovar1.pdf}
/\^/ {
w /dev/stderr
q 56;} # (mostly GNU sed extensions)
/\(^\|[^\\]\)[[:upper:]]/ { # [^\\] -- a workaround for grammatical terms,
# to escape these restrictions on Latin characters: write them like \Pres, \Gen etc.
w /dev/stderr
q 57;} # (mostly GNU sed extensions)
g # Restore after the destructive checks (copy hold space to pattern space).
"
}

# FIXME: A "keep-going" mode: detect all the non-well-formednesses, and accumulate all
# the error codes in the exitCode (say, as bits).

#######################################
# Entry point: recode stdin to stdout and explain the two known error codes.
# Outputs: recoded text on stdout; diagnostics on stderr
# Returns: the status of recode_transcription, unchanged
#######################################
main() {
  local exitCode
  recode_transcription
  exitCode="$?"
  case "$exitCode" in
    56)
      printf '%s\n' $"-- error: non well-formed input (a left-over '^')!" >&2
      ;;
    57)
      printf '%s\n' $"-- error: non well-formed input (a left-over capital Latin char)!" >&2
      ;;
  esac
  return "$exitCode"
}

# Must stay the last command: its status is the exit status of the script.
main "$@"

# (Mordvin) Check that I have marked softness? E.g.:
# |ti|t'i|