% tempbib3.bib
@INPROCEEDINGS{Atallah2001,
author = {Mikhail J. Atallah and Victor Raskin and Michael Crogan
and Christian Hempelmann and Florian Kerschbaum and Dina Mohamed
and Sanket Naik},
institution = {Purdue CERIAS},
title = {Natural Language Watermarking:
Design, Analysis, and a Proof-of-Concept Implementation},
booktitle = {Information Hiding: Fourth International Workshop},
editor = {Ira S. Moskowitz},
location = {Pittsburgh, PA, USA},
month = {April},
year = {2001},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
volume = {2137},
isbn = {3-540-42733-3},
pages = {185--199},
issn = {0302-9743},
url = {http://omni.cc.purdue.edu/~vraskin/IHW.AtaRasEtAl.pdf},
abstract = {We describe a scheme for watermarking natural language text.
Let $n$ denote the total number of sentences of a text, $\alpha$ denote
the number of sentences that carry watermark bits. The modifications
that an adversary can perform (for the purpose of removing the
watermark) are as follows:
(i) Meaning-preserving transformations of sentences of the text (e.g.
translation to another natural language). This cannot damage the
watermark.
(ii) Meaning-modifying transformations of sentences of the text. Each
such transformation has probability $\leq 3 \alpha / n$ of damaging the
watermark.
(iii) Insertions of new sentences in the text. Each such insertion has
probability $\leq 2 \alpha / n$ of damaging the watermark.
(iv) Moving a contiguous block of sentences from one place of the text
to another. Each block-motion has probability $\leq 3 \alpha / n$ of
damaging the watermark.
Our scheme is keyed, and having the key is all that is required for
reading the watermark; it does not require knowledge of the original
(pre-watermark) version of the text, or knowledge of the watermark
message. The probability of a ``false positive'', i.e. that the text
spuriously contains any particular $w$-bit watermark, is $2^{-w}$.}
}
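% Worked example (illustrative numbers, not from the paper): for a text of
% $n = 200$ sentences of which $\alpha = 20$ carry watermark bits, a single
% meaning-modifying transformation or block move damages the watermark with
% probability $\leq 3\alpha/n = 0.3$, a single sentence insertion with
% probability $\leq 2\alpha/n = 0.2$, and a $w = 32$-bit watermark is
% spuriously present with probability $2^{-32} \approx 2.3 \times 10^{-10}$.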
@INPROCEEDINGS{Atallah2002,
author = {Mikhail J. Atallah and Victor Raskin and Christian F. Hempelmann
and Mercan Karahan and Radu Sion and Umut Topkara
and Katrina E. Triezenberg},
institution = {Purdue CERIAS},
title = {Natural Language Watermarking and Tamperproofing},
booktitle = {Information Hiding: Fifth International Workshop},
editor = {Fabien A. P. Petitcolas},
location = {Noordwijkerhout, Netherlands},
month = {October},
year = {2002},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
volume = {2578},
pages = {196--212},
issn = {0302-9743},
url = {http://omni.cc.purdue.edu/~vraskin/IHW-2002.pdf},
abstract = {Two main results in the area of information hiding in natural
language text are presented. A semantically-based scheme dramatically
improves the information hiding capacity of any text through two
techniques:
(i) modifying the granularity of meaning of individual sentences, whereas
our own previous scheme kept the granularity fixed, and
(ii) halving the number of sentences affected by the watermark.
No longer a ``long text, short watermark'' approach, it now makes it
possible to watermark short texts like wire agency reports. Using both
the above-mentioned semantic marking scheme and our previous
syntactically-based method hides information in a way that reveals any
non-trivial tampering with the text (while re-formatting is not considered
to be tampering -- the problem would be solved trivially otherwise by
hiding a hash of the text) with a probability $1 - 2^{-\beta (n + 1)}$,
$n$ being its number of sentences and $\beta$ a small positive integer
based on the extent of co-referencing.}
}
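% Worked example (illustrative numbers, not from the paper): with $\beta = 1$
% and a short text of $n = 10$ sentences, non-trivial tampering is revealed
% with probability $1 - 2^{-\beta(n+1)} = 1 - 2^{-11} \approx 0.9995$, and
% the probability approaches 1 rapidly as the text grows.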
@TECHREPORT{Bennett2004,
author = {Krista Bennett},
institution = {Purdue CERIAS},
title = {Linguistic Steganography:
Survey, Analysis, and Robustness Concerns for Hiding Information in Text},
number = {TR 2004-13},
month = {May},
year = {2004},
url = {https://www.cerias.purdue.edu/tools_and_resources/bibtex_archive/archive/2004-13.pdf,
http://omni.cc.purdue.edu/~vraskin/Semantic-Mimicking.pdf},
abstract = {Steganography is an ancient art. With the advent of computers,
we have vast accessible bodies of data in which to hide information, and
increasingly sophisticated techniques with which to analyze and recover
that information. While much of the recent research in steganography has
been centered on hiding data in images, many of the solutions that work
for images are more complicated when applied to natural language text as
a cover medium. Many approaches to steganalysis attempt to detect
statistical anomalies in cover data which predict the presence of hidden
information. Natural language cover texts must not only pass the
statistical muster of automatic analysis, but also the minds of human
readers. Linguistically naive approaches to the problem use statistical
frequency of letter combinations or random dictionary words to encode
information. More sophisticated approaches use context-free grammars to
generate syntactically correct cover text which mimics the syntax of
natural text. None of these uses meaning as a basis for generation, and
little attention is paid to the semantic cohesiveness of a whole text as
a data point for statistical attack. This paper provides a basic
introduction to steganography and steganalysis, with a particular focus
on text steganography. Text-based information hiding techniques are
discussed, providing motivation for moving toward linguistic steganography
and steganalysis. We highlight some of the problems inherent in text
steganography as well as issues with existing solutions, and describe
linguistic problems with character-based, lexical, and syntactic
approaches. Finally, the paper explores how a semantic and rhetorical
generation approach suggests solutions for creating more believable cover
texts, presenting some current and future issues in analysis and
generation. The paper is intended to be both general enough that linguists
without training in information security and computer science can
understand the material, and specific enough that the linguistic and
computational problems are described in adequate detail to justify the
conclusions suggested.}
}
@INPROCEEDINGS{Topkara2005,
author = {Mercan Topkara and Cuneyt M. Taskiran and Edward J. Delp},
institution = {Purdue CERIAS},
title = {Natural Language Watermarking},
booktitle = {Proceedings of the SPIE International Conference on Security,
Steganography, and Watermarking of Multimedia Contents},
editor = {Edward J. Delp and Ping W. Wong},
location = {San Jose, CA, USA},
month = {January},
year = {2005},
volume = {5681},
url = {http://www.cs.purdue.edu/homes/mkarahan/ei05_5681_45.pdf},
keywords = {text watermarking, natural language processing,
text steganography},
abstract = {In this paper we discuss natural language watermarking, which
uses the structure of the sentence constituents in natural language text
in order to insert a watermark. This approach is different from techniques,
collectively referred to as text watermarking, which embed information
by modifying the appearance of text elements, such as lines, words, or
characters. We provide a survey of the current state of the art in natural
language watermarking and introduce terminology, techniques, and tools for
text processing. We also examine the parallels and differences of the two
watermarking domains and outline how techniques from the image
watermarking domain may be applicable to the natural language watermarking
domain.}
}
@INPROCEEDINGS{Taskiran2006,
author = {Cuneyt M. Taskiran and Umut Topkara and Mercan Topkara and
Edward J. Delp},
title = {Attacks on Lexical Natural Language Steganography Systems},
booktitle = {Proceedings of the SPIE International Conference on Security,
Steganography, and Watermarking of Multimedia Contents},
year = {2006},
month = {January},
location = {San Jose, CA},
url = {http://homes.cerias.purdue.edu/~mercan/spie06_6072-9_paper.pdf},
abstract = {Text data forms the largest bulk of digital data that people
encounter and exchange daily. For this reason the potential usage of text
data as a covert channel for secret communication is an imminent concern.
Even though information hiding into natural language text has started to
attract great interest, there has been no study on attacks against these
applications. In this paper we examine the robustness of lexical
steganography systems, using a universal steganalysis method based on
language models and support vector machines to differentiate sentences
modified by a lexical steganography algorithm from unmodified sentences.
The experimental accuracy of our method on
classification of steganographically modified sentences was 84.9 percent.
On classification of isolated sentences we obtained a high recall rate
whereas the precision was low.}
}
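% Background note (standard definitions, not from the paper): taking
% stego-modified sentences as the positive class, recall
% $= \mathrm{TP}/(\mathrm{TP} + \mathrm{FN})$ and precision
% $= \mathrm{TP}/(\mathrm{TP} + \mathrm{FP})$; high recall with low precision
% means most modified sentences are flagged, but many unmodified sentences
% are flagged along with them.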
@INPROCEEDINGS{Topkara2006,
author = {Mercan Topkara and Giuseppe Riccardi and Dilek Hakkani-Tur
and Mikhail J. Atallah},
title = {Natural Language Watermarking: Challenges in Building
a Practical System},
booktitle = {Proceedings of the SPIE International Conference on Security,
Steganography, and Watermarking of Multimedia Contents},
year = {2006},
month = {January},
location = {San Jose, CA},
url = {http://homes.cerias.purdue.edu/~mercan/spie06_6072-10_paper.pdf},
abstract = {This paper gives an overview of the research and implementation
challenges we encountered in building an end-to-end natural language
processing based watermarking system. By natural language watermarking,
we mean embedding the watermark into a text document, using the natural
language components as the carrier, in such a way that the modifications
are imperceptible to the readers and the embedded information is robust
against possible attacks. Of particular interest is using the structure
of the sentences in natural language text in order to insert the watermark.
We evaluated the quality of the watermarked text using an objective
evaluation metric, the BLEU score. BLEU scoring is commonly used in the
statistical machine translation community. Our current system prototype
achieves a BLEU score of 0.45 on a scale of [0, 1].}
}
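% Background note (standard definition, not from the paper): the BLEU score
% cited above is the brevity-penalized geometric mean of modified $n$-gram
% precisions $p_n$,
% $\mathrm{BLEU} = \mathrm{BP} \cdot \exp(\sum_{n=1}^{N} w_n \log p_n)$ with
% $\mathrm{BP} = \min(1, e^{1 - r/c})$,
% where $r$ is the reference length, $c$ the candidate length, and typically
% $N = 4$ with uniform weights $w_n = 1/N$.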
@INPROCEEDINGS{Topkara2006a,
author = {Umut Topkara and Mercan Topkara and Mikhail J. Atallah},
title = {The hiding virtues of ambiguity: quantifiably resilient watermarking
of natural language text through synonym substitutions},
booktitle = {MM\&Sec '06: Proceedings of the 8th Workshop on Multimedia
and Security},
year = {2006},
isbn = {1-59593-493-6},
pages = {164--174},
location = {Geneva, Switzerland},
doi = {10.1145/1161366.1161397},
publisher = {ACM Press},
address = {New York, NY, USA},
abstract = {Information-hiding in natural language text has mainly consisted
of carrying out approximately meaning-preserving modifications on the given
cover text until it encodes the intended mark. A major technique for doing
so has been synonym-substitution. In these previous schemes, synonym
substitutions were done until the text ``confessed'', i.e., carried the
intended mark message. We propose here a better way to use synonym
substitution, one that is no longer entirely guided by the mark-insertion
process: It is also guided by a resilience requirement, subject to a
maximum allowed distortion constraint. Previous schemes for information
hiding in natural language text did not use numeric quantification of the
distortions introduced by transformations, they mainly used heuristic
measures of quality based on conformity to a language model (and not in
reference to the original cover text). When there are many alternatives to
carry out a substitution on a word, we prioritize these alternatives
according to a quantitative resilience criterion and use them in that
order. In a nutshell, we favor the more ambiguous alternatives. In fact
not only do we attempt to achieve the maximum ambiguity, but we want to
simultaneously be as close as possible to the above-mentioned distortion
limit, as that prevents the adversary from doing further transformations
without exceeding the damage threshold; that is, we continue to modify the
document even after the text has ``confessed'' to the mark, for the dual
purpose of maximizing ambiguity while deliberately getting as close as
possible to the distortion limit. The quantification we use makes possible
an application of the existing information-theoretic framework, to the
natural language domain, which has unique challenges not present in the
image or audio domains. The resilience stems from both (i) the fact that
the adversary does not know where the changes were made, and (ii) the fact
that automated disambiguation is a major difficulty faced by any natural
language processing system (what is bad news for the natural language
processing area is good news for our scheme's resilience). In addition to
the above mentioned design and analysis, another contribution of this paper
is the description of the implementation of the scheme and of the
experimental data obtained.}
}
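% Sketch (our formalization; the symbols $A$, $D$, $D_{\max}$ are not from
% the paper): the selection rule above amounts to, for each markable word
% $x$ with synonym set $\mathrm{Syn}(x)$, choosing
% $\arg\max_{s \in \mathrm{Syn}(x)} A(s)$ subject to $D(T, T') \leq D_{\max}$,
% where $A$ scores ambiguity, $T$ is the cover text, $T'$ the marked text,
% and $D$ the numeric distortion measure; embedding continues toward
% $D_{\max}$ even after the mark is encoded, so that an adversary has little
% distortion budget left for removal attacks.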
@INPROCEEDINGS{Topkara2006b,
author = {Mercan Topkara and Umut Topkara and Mikhail J. Atallah},
title = {Words Are Not Enough: Sentence Level Natural Language
Watermarking},
booktitle = {Proceedings of the ACM Workshop on Content Protection and
Security (in conjunction with ACM Multimedia)},
year = {2006},
month = {October},
location = {Santa Barbara, CA}
}
@INPROCEEDINGS{Topkara2007,
author = {Mercan Topkara and Umut Topkara and Mikhail J. Atallah},
title = {Information hiding through errors: A confusing approach},
booktitle = {Proceedings of the SPIE International Conference on Security,
Steganography, and Watermarking of Multimedia Contents},
year = {2007},
month = {January},
location = {San Jose, CA}
}