Tesseract
3.02
Main Page
Related Pages
Modules
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
unicodes.cpp
Go to the documentation of this file.
1
/**********************************************************************
2
* File: unicodes.cpp
3
* Description: Unicode related machinery
4
* Author: David Eger
5
* Created: Wed Jun 15 16:37:50 PST 2011
6
*
7
* (C) Copyright 2011, Google, Inc.
8
** Licensed under the Apache License, Version 2.0 (the "License");
9
** you may not use this file except in compliance with the License.
10
** You may obtain a copy of the License at
11
** http://www.apache.org/licenses/LICENSE-2.0
12
** Unless required by applicable law or agreed to in writing, software
13
** distributed under the License is distributed on an "AS IS" BASIS,
14
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
** See the License for the specific language governing permissions and
16
** limitations under the License.
17
*
18
**********************************************************************/
19
20
#include "
unicodes.h
"
21
#include "
host.h
"
// for NULL
22
23
namespace tesseract {

// Strings for Unicode separator and bidirectional (BiDi) control characters.
// NOTE: a \uXXXX escape in a narrow string literal is encoded in the
// compiler's execution character set; the names and byte sequences below
// assume that character set is UTF-8.
const char *kUTF8LineSeparator = "\u2028";       // "\xe2\x80\xa8"
const char *kUTF8ParagraphSeparator = "\u2029";  // "\xe2\x80\xa9"
const char *kLRM = "\u200E";  // Left-to-Right Mark
const char *kRLM = "\u200F";  // Right-to-Left Mark
// U+202B is RIGHT-TO-LEFT EMBEDDING. U+202A, which is easy to confuse with
// it, is LEFT-TO-RIGHT EMBEDDING and must not be used for this constant.
const char *kRLE = "\u202B";  // Right-to-Left Embedding
const char *kPDF = "\u202C";  // Pop Directional Formatting

// Characters that look like a hyphen. The list is terminated by a NULL
// entry, so it can be walked without knowing its length.
const char *kHyphenLikeUTF8[] = {
  "-",       // ASCII hyphen-minus
  "\u05BE",  // word hyphen in Hebrew
  "\u2010",  // hyphen
  "\u2011",  // non-breaking hyphen
  "\u2012",  // a hyphen the same width as digits
  "\u2013",  // en dash
  "\u2014",  // em dash
  "\u2015",  // horizontal bar
  "\u2212",  // arithmetic minus sign
  "\uFE58",  // small em dash
  "\uFE63",  // small hyphen-minus
  "\uFF0D",  // fullwidth hyphen-minus
  NULL,      // end of our list
};

// Characters that look like an apostrophe. The list is terminated by a NULL
// entry, so it can be walked without knowing its length.
const char *kApostropheLikeUTF8[] = {
  "'",       // ASCII apostrophe
  "`",       // ASCII backtick
  "\u2018",  // opening single quote
  "\u2019",  // closing single quote
  "\u2032",  // mathematical prime mark
  NULL,      // end of our list.
};

}  // namespace tesseract
mnt
data
src
tesseract-ocr
ccutil
unicodes.cpp
Generated on Thu Nov 1 2012 20:19:46 for Tesseract by
1.8.1