@inproceedings{atallah2001natural,
  author      = {Atallah, Mikhail J. and Raskin, Victor and Crogan, Michael
    and Hempelmann, Christian and Kerschbaum, Florian and Mohamed, Dina
    and Naik, Sanket},
  institution = {Purdue CERIAS},
  title       = {Natural Language Watermarking:
    Design, Analysis, and a Proof-of-Concept Implementation},
  booktitle   = {Information Hiding: Fourth International Workshop},
  editor      = {Moskowitz, Ira S.},
  location    = {Pittsburgh, PA, USA},
  month       = apr,
  year        = {2001},
  publisher   = {Springer},
  series      = {Lecture Notes in Computer Science},
  volume      = {2137},
  isbn        = {3-540-42733-3},
  pages       = {185--199},
  issn        = {0302-9743},
  url         = {http://omni.cc.purdue.edu/~vraskin/IHW.AtaRasEtAl.pdf},
  abstract    = {We describe a scheme for watermarking natural language text.
    Let $n$ denote the total number of sentences of a text, $\alpha$ denote
    the number of sentences that carry watermark bits. The modifications
    that an adversary can perform (for the purpose of removing the
    watermark) are as follows:
    (i) Meaning-preserving transformations of sentences of the text (e.g.
    translation to another natural language). This cannot damage the
    watermark.
    (ii) Meaning-modifying transformations of sentences of the text. Each
    such transformation has probability $\leq 3 \alpha / n$ of damaging the
    watermark.
    (iii) Insertions of new sentences in the text. Each such insertion has
    probability $\leq 2 \alpha / n$ of damaging the watermark.
    (iv) Moving a contiguous block of sentences from one place of the text
    to another. Each block-motion has probability $\leq 3 \alpha / n$ of
    damaging the watermark.

    Our scheme is keyed, and having the key is all that is required for
    reading the watermark; it does not require knowledge of the original
    (pre-watermark) version of the text, or knowledge of the watermark
    message. The probability of a ``false positive'', i.e. that the text
    spuriously contains any particular $w$-bit watermark, is $2^{-w}$.},
}
  author = {Mikhail J. Atallah and Victor Raskin and Christian F. Hempelmann
    and Mercan Karahan and Radu Sion and Umut Topkara
    and Katrina E. Triezenberg},
  institution = {Purdue CERIAS},
  title = {Natural Language Watermarking and Tamperproofing},
  booktitle = {Information Hiding: Fifth International Workshop},
  editor = {Fabien A. P. Petitcolas},
  location = {Noordwijkerhout, Netherlands},
  month = {October},
  year = {2002},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  volume = {2578},
  pages = {196--212},
  issn = {0302-9743},
  url = {http://omni.cc.purdue.edu/~vraskin/IHW-2002.pdf},
  abstract = {Two main results in the area of information hiding in natural
    language text are presented. A semantically-based scheme dramatically
    improves the information hiding capacity of any text through two
    (i) modifying the granularity of meaning of individual sentences, whereas
    our own previous scheme kept the granularity fixed, and
    (ii) halving the number of sentences affected by the watermark.
    No longer a ``long text, short watermark'' approach, it now makes it
    possible to watermark short texts like wire agency reports. Using both
    the above-mentioned semantic marking scheme and our previous
    syntactically-based method hides information in a way that reveals any
    non-trivial tampering with the text (while re-formatting is not considered
    to be tampering -- the problem would be solved trivially otherwise by
    hiding a hash of the text) with a probability $l - 2^{ \beta (n + 1) }$,
    $n$ being its number of sentences and $\beta$ a small positive integer
    based on the extend of co-referencing.}
  author = {Krista Bennett},
  institution = {Purdue CERIAS},
  title = {Linguistic Steganography:
    Survey, Analysis, and Robustness Concerns for Hiding Information in Text},
  number = {TR 2004-13},
  month = {May},
  year = {2004},
  url = {https://www.cerias.purdue.edu/
  abstract = {Steganography is an ancient art. With the advent of computers,
    we have vast accessible bodies of data in which to hide information, and
    increasingly sophisticated techniques with which to analyze and recover
    that information. While much of the recent research in steganography has
    been centered on hiding data in images, many of the solutions that work
    for images are more complicated when applied to natural language text as
    a cover medium. Many approaches to steganalysis attempt to detect
    statistical anomalies in cover data which predict the presence of hidden
    information. Natural language cover texts must not only pass the
    statistical muster of automatic analysis, but also the minds of human
    readers. Linguistically naive approaches to the problem use statistical
    frequency of letter combinations or random dictionary words to encode
    information. More sophisticated approaches use context-free grammars to
    generate syntactically correct cover text which mimics the syntax of
    natural text. None of these uses meaning as a basis for generation, and
    little attention is paid to the semantic cohesiveness of a whole text as
    a data point for statistical attack. This paper provides a basic
    introduction to steganography and steganalysis, with a particular focus
    on text steganography. Text-based information hiding techniques are
    discussed, providing motivation for moving toward linguistic steganography
    and steganalysis. We highlight some of the problems inherent in text
    steganography as well as issues with existing solutions, and describe
    linguistic problems with character-based, lexical, and syntactic
    approaches. Finally, the paper explores how a semantic and rhetorical
    generation approach suggests solutions for creating more believable cover
    texts, presenting some current and future issues in analysis and
    generation. The paper is intended to be both general enough that linguists
    without training in information security and computer science can
    understand the material, and specific enough that the linguistic and
    computational problems are described in adequate detail to justify the
    conclusions suggested.}
  author = {Mercan Topkara and Cuneyt M. Taskiran and Edward J. Delp},
  institution = {Purdue CERIAS},
  title = {Natural Language Watermarking},
  booktitle = {Proceedings of the SPIE International Conference on Security,
    Steganography, and Watermarking of Multimedia Contents},
  editor = {Edward J. Delp and Ping W. Wong},
  location = {San Jose, CA, USA},
  month = {January},
  year = {2005},
  volume = {5681},
  url = {http://www.cs.purdue.edu/homes/mkarahan/ei05_5681_45.pdf},
  keywords = {text watermarking, natural language processing,
    text steganography},
  abstract = {In this paper we discuss natural language watermarking, which
    uses the structure of the sentence constituents in natural language text
    in order to insert a watermark. This approach is different from techniques,
    collectively referred to as  text watermarking,  which embed information
    by modifying the appearance of text elements, such as lines, words, or
    characters. We provide a survey of the current state of the art in natural
    language watermarking and introduce terminology, techniques, and tools for
    text processing. We also examine the parallels and differences of the two
    watermarking domains and outline how techniques from the image
    watermarking domain may be applicable to the natural language watermarking
  author = {Cuneyt M. Taskiran and Umut Topkara and Mercan Topkara and
    Edward J. Delp},
  title = {Attacks on Lexical Natural Language Steganography Systems},
  booktitle = {Proceedings of the SPIE International Conference on Security,
    Steganography, and Watermarking of Multimedia Contents},
  year = {2006},
  month = {January},
  location = {San Jose, CA},
  url = {http://homes.cerias.purdue.edu/~mercan/spie06_6072-9_paper.pdf},
  abstract = {Text data forms the largest bulk of digital data that people
    encounter and exchange daily. For this reason the potential usage of text
    data as a covert channel for secret communication is an imminent concern.
    Even though information hiding into natural language text has started to
    attract great interest, there has been no study on attacks against these
    applications. In this paper we examine the robustness of lexical
    steganography systems.In this paper we used a universal steganalysis
    method based on language models and support vector machines to
    differentiate sentences modified by a lexical steganography algorithm from
    unmodified sentences. The experimental accuracy of our method on
    classification of steganographically modified sentences was 84.9 percent.
    On classification of isolated sentences we obtained a high recall rate
    whereas the precision was low.}
  author = {Mercan Topkara and  Guiseppe Riccardi and Dilek Hakkani-Tur
    and Mikhail J. Atallah},
  title = {Natural Language Watermarking: Challenges in Building
    a Practical System},
  booktitle = {Proceedings of the SPIE International Conference on Security,
    Steganography, and Watermarking of Multimedia Contents},
  year = {2006},
  month = {January},
  location = {San Jose, CA},
  url = {http://homes.cerias.purdue.edu/~mercan/spie06_6072-10_paper.pdf},
  abstract = {This paper gives an overview of the research and implementation
    challenges we encountered in building an end-to-end natural language
    processing based watermarking system. With natural language watermarking,
    we mean embedding the watermark into a text document, using the natural
    language components as the carrier, in such a way that the modifications
    are imperceptible to the readers and the embedded information is robust
    against possible attacks. Of particular interest is using the structure
    of the sentences in natural language text in order to insert the watermark.
    We evaluated the quality of the watermarked text using an objective
    evaluation metric, the BLEU score. BLEU scoring is commonly used in the
    statistical machine translation community. Our current system prototype
    achieves 0.45 BLEU score on a scale [0,1].}
  author = {Umut Topkara and Mercan Topkara and Mikhail J. Atallah},
  title = {The hiding virtues of ambiguity: quantifiably resilient watermarking
    of natural language text through synonym substitutions},
  booktitle = {MM\&Sec '06: Proceeding of the 8th workshop on Multimedia and
  year = {2006},
  isbn = {1-59593-493-6},
  pages = {164--174},
  location = {Geneva, Switzerland},
  doi = {http://doi.acm.org/10.1145/1161366.1161397},
  publisher = {ACM Press},
  address = {New York, NY, USA},
  abstract = {Information-hiding in natural language text has mainly consisted
    of carrying out approximately meaning-preserving modifications on the given
    cover text until it encodes the intended mark. A major technique for doing
    so has been synonym-substitution. In these previous schemes, synonym
    substitutions were done until the text "confessed", i.e., carried the
    intended mark message. We propose here a better way to use synonym
    substitution, one that is no longer entirely guided by the mark-insertion
    process: It is also guided by a resilience requirement, subject to a
    maximum allowed distortion constraint. Previous schemes for information
    hiding in natural language text did not use numeric quantification of the
    distortions introduced by transformations, they mainly used heuristic
    measures of quality based on conformity to a language model (and not in
    reference to the original cover text). When there are many alternatives to
    carry out a substitution on a word, we prioritize these alternatives
    according to a quantitative resilience criterion and use them in that
    order. In a nutshell, we favor the more ambiguous alternatives. In fact
    not only do we attempt to achieve the maximum ambiguity, but we want to
    simultaneously be as close as possible to the above-mentioned distortion
    limit, as that prevents the adversary from doing further transformations
    without exceeding the damage threshold; that is, we continue to modify the
    document even after the text has "confessed" to the mark, for the dual
    purpose of maximizing ambiguity while deliberately getting as close as
    possible to the distortion limit. The quantification we use makes possible
    an application of the existing information-theoretic framework, to the
    natural language domain, which has unique challenges not present in the
    image or audio domains. The resilience stems from both (i) the fact that
    the adversary does not know where the changes were made, and (ii) the fact
    that automated disambiguation is a major difficulty faced by any natural
    language processing system (what is bad news for the natural language
    processing area, is good news for our scheme's resilience). In addition to
    the above mentioned design and analysis, another contribution of this paper
    is the description of the implementation of the scheme and of the
    experimental data obtained.}
  author = {Mercan Topkara and Umut Topkara and Mikhail J. Atallah},
  title = {Words Are Not Enough: Sentence Level Natural Language
  booktitle = {Proceedings of the ACM Workshop on Content Protection and
    Security (in conjuction with ACM Multimedia)},
  year = {2006},
  month = {October},
  location = {Santa Barbara, CA}
  author = {Mercan Topkara and Umut Topkara and Mikhail J. Atallah},
  title = {Information hiding through errors: A confusing approach},
  booktitle = {Proceedings of the SPIE International Conference on Security,
    Steganography, and Watermarking of Multimedia Contents},
  year = {2007},
  month = {January},
  location = {San Jose, CA}