\name{read_translated_text}
\alias{read_translated_text}
\title{Read Annotated Sumerian Translations from Text Files}
\description{
Reads Word documents (.docx) or plain text files containing annotated
Sumerian translations and extracts sign names, grammatical types, and
meanings into a structured data frame.
}
\usage{
read_translated_text(file, mapping=NULL)
}
\arguments{
\item{file}{A character vector of file paths to .docx or text files.
Files must contain translation lines that are formatted as described below.}
\item{mapping}{A data frame containing sign-to-reading mappings with columns
\code{name}, \code{cuneiform} and \code{syllables}. If \code{NULL} (default), the package's built-in
mapping file \code{etcsl_mapping.txt} is used.}
}

\details{
\subsection{Input Format}{
The input files must contain lines starting with \code{|} in the following format:

\code{|sign_name: TYPE: meaning}

or

\code{|equation for sign_name: TYPE: meaning}

For example:
\preformatted{
|a2-tab: S: the double amount of work performance
|me=ME: S: divine force
|AN: S: god of heaven
|na=NA: Sx->A: whose existence is bound to S
}

Lines not starting with \code{|} are ignored. If a line contains an equation of sign names, only the first entry of the equation is extracted. The following notation is suggested for grammatical types:

\itemize{
\item \code{S} for substantives and noun phrases (e.g., "the old man in the temple")
\item \code{V} for verbs and decorated verbs (e.g., "to go", "to bring the delivery into the temple")
\item \code{A} for adjectives, attributes and subordinate clauses that further define the subject (e.g., "who/which is weak", "whose resource for sustaining life is grain")
\item \code{Sx->A} for a symbol that transforms the preceding noun phrase into an attribute (e.g., "whose resource for sustaining life is \code{S}"). Other transformations are denoted accordingly.
\item \code{N} for numbers,
\item \code{D} for everything else.
}
}

\subsection{Processing Steps}{
\enumerate{
\item Reads text from .docx files or plain text files
\item Filters lines starting with \code{|}
\item Parses each line into sign name, type, and meaning components
\item Normalizes transliterated text by removing separators and looking up the sign names in the \code{mapping}
\item Cleans the meaning field by removing content after \code{;} or \code{|} delimiters
\item Issues a warning for entries with missing type annotations
\item Excludes empty sign names from the result
}
}

}
\value{
A data frame with the following columns:
\describe{
\item{sign_name}{The normalized sign name with components separated by hyphens (e.g., \code{"A"}, \code{"AN"}, \code{"X-NA"})}
\item{type}{Grammatical type (e.g., \code{"S"}, \code{"V"}, \code{"A"}, \code{"Sx->A"})}
\item{meaning}{The translated meaning of the sign}
}
}
\note{
If any translations have missing type annotations, the function prints a
warning message listing the affected entries.
}
\seealso{
\code{\link{convert_to_dictionary}} for converting the result into a dictionary,
\code{\link{make_dictionary}} for creating a complete dictionary with
cuneiform representations and readings in a single step.
}
\examples{

# Read translations from a single text document
filename     <- system.file("extdata", "text_with_translations.txt", package = "sumer")
translations <- read_translated_text(filename)

# View the structure
head(translations)

# Filter by grammatical type
nouns <- translations[translations$type == "S", ]
nouns

# Apply some custom normalizations to the meanings (here: removing the word "the")
translations$meaning <- gsub("\\\\bthe\\\\b", "", translations$meaning, ignore.case = TRUE)
translations$meaning <- trimws(gsub("\\\\s+", " ", translations$meaning))

# View the structure
head(translations)

# Convert the result into a dictionary
dictionary   <- convert_to_dictionary(translations)

# View the structure
head(dictionary)

}
