mupdf
Loading...
Searching...
No Matches
structured-text.h File Reference
Include dependency graph for structured-text.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

struct  fz_layout_char
struct  fz_layout_line
struct  fz_layout_block
struct  fz_stext_page_details
struct  fz_stext_page
struct  fz_stext_grid_info
struct  fz_stext_block
struct  fz_stext_line
struct  fz_stext_char
struct  fz_stext_struct
struct  fz_stext_grid_divider
struct  fz_stext_grid_positions
struct  fz_stext_options
struct  fz_search_quad
struct  fz_stext_position
struct  fz_search_match
struct  fz_search_result
struct  fz_stext_page_block_iterator
struct  fz_image_raft_options

Typedefs

typedef struct fz_layout_char fz_layout_char
typedef struct fz_layout_line fz_layout_line
typedef struct fz_stext_char fz_stext_char
typedef struct fz_stext_line fz_stext_line
typedef struct fz_stext_block fz_stext_block
typedef struct fz_stext_struct fz_stext_struct
typedef struct fz_stext_grid_positions fz_stext_grid_positions
typedef int fz_search_callback_fn(fz_context *ctx, void *opaque, int num_quads, fz_quad *hit_bbox)
typedef int fz_match_callback_fn(fz_context *ctx, void *opaque, int num_quads, fz_quad *hit_bbox, int chapter, int page)
typedef struct fz_search fz_search
typedef struct fz_flotilla fz_flotilla

Enumerations

enum  {
  FZ_STEXT_PRESERVE_LIGATURES = 1 , FZ_STEXT_PRESERVE_WHITESPACE = 2 , FZ_STEXT_PRESERVE_IMAGES = 4 , FZ_STEXT_INHIBIT_SPACES = 8 ,
  FZ_STEXT_DEHYPHENATE = 16 , FZ_STEXT_PRESERVE_SPANS = 32 , FZ_STEXT_CLIP = 64 , FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE = 128 ,
  FZ_STEXT_COLLECT_STRUCTURE = 256 , FZ_STEXT_ACCURATE_BBOXES = 512 , FZ_STEXT_COLLECT_VECTORS = 1024 , FZ_STEXT_IGNORE_ACTUALTEXT = 2048 ,
  FZ_STEXT_SEGMENT = 4096 , FZ_STEXT_PARAGRAPH_BREAK = 8192 , FZ_STEXT_TABLE_HUNT = 16384 , FZ_STEXT_COLLECT_STYLES = 32768 ,
  FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE = 65536 , FZ_STEXT_CLIP_RECT = (1<<17) , FZ_STEXT_ACCURATE_ASCENDERS = (1<<18) , FZ_STEXT_ACCURATE_SIDE_BEARINGS = (1<<19) ,
  FZ_STEXT_LAZY_VECTORS = (1<<20) , FZ_STEXT_FUZZY_VECTORS = (1<<21) , FZ_STEXT_MEDIABOX_CLIP = FZ_STEXT_CLIP
}
enum  {
  FZ_STEXT_BLOCK_TEXT = 0 , FZ_STEXT_BLOCK_IMAGE = 1 , FZ_STEXT_BLOCK_STRUCT = 2 , FZ_STEXT_BLOCK_VECTOR = 3 ,
  FZ_STEXT_BLOCK_GRID = 4
}
enum  {
  FZ_STEXT_TEXT_JUSTIFY_UNKNOWN = 0 , FZ_STEXT_TEXT_JUSTIFY_LEFT = 1 , FZ_STEXT_TEXT_JUSTIFY_CENTER = 2 , FZ_STEXT_TEXT_JUSTIFY_RIGHT = 3 ,
  FZ_STEXT_TEXT_JUSTIFY_FULL = 4
}
enum  { FZ_STEXT_VECTOR_IS_STROKED = 1 , FZ_STEXT_VECTOR_IS_RECTANGLE = 2 , FZ_STEXT_VECTOR_CONTINUES = 4 }
enum  {
  FZ_STEXT_GRID_H_CROSSED = 1 , FZ_STEXT_GRID_V_CROSSED = 2 , FZ_STEXT_GRID_L_BORDER = 4 , FZ_STEXT_GRID_T_BORDER = 8 ,
  FZ_STEXT_GRID_FULL = 16
}
enum  fz_stext_line_flags { FZ_STEXT_LINE_FLAGS_JOINED = 1 }
enum  {
  FZ_STEXT_STRIKEOUT = 1 , FZ_STEXT_UNDERLINE = 2 , FZ_STEXT_SYNTHETIC = 4 , FZ_STEXT_BOLD = 8 ,
  FZ_STEXT_FILLED = 16 , FZ_STEXT_STROKED = 32 , FZ_STEXT_CLIPPED = 64 , FZ_STEXT_UNICODE_IS_CID = 128 ,
  FZ_STEXT_UNICODE_IS_GID = 256 , FZ_STEXT_SYNTHETIC_LARGE = 512 , FZ_STEXT_HIGHLIGHT = 1024
}
enum  fz_stext_xml_flags { FZ_STEXT_XML_FLAGS_CHARS = 1 , FZ_STEXT_XML_FLAGS_POINTERS = 2 }
enum  { FZ_SELECT_CHARS , FZ_SELECT_WORDS , FZ_SELECT_LINES }
enum  fz_search_options {
  FZ_SEARCH_EXACT = 0 , FZ_SEARCH_IGNORE_CASE = 1 , FZ_SEARCH_IGNORE_DIACRITICS = 2 , FZ_SEARCH_REGEXP = 4 ,
  FZ_SEARCH_KEEP_LINES = 8 , FZ_SEARCH_KEEP_PARAGRAPHS = 16 , FZ_SEARCH_KEEP_HYPHENS = 32
}
enum  fz_search_reason { FZ_SEARCH_MORE_INPUT = 0 , FZ_SEARCH_MATCH = 1 , FZ_SEARCH_COMPLETE }

Functions

fz_layout_block * fz_new_layout (fz_context *ctx)
void fz_drop_layout (fz_context *ctx, fz_layout_block *block)
void fz_add_layout_line (fz_context *ctx, fz_layout_block *block, float x, float y, float h, const char *p)
void fz_add_layout_char (fz_context *ctx, fz_layout_block *block, float x, float w, const char *p)
fz_stext_page * fz_keep_stext_page (fz_context *ctx, fz_stext_page *page)
fz_stext_page_details * fz_stext_page_details_for_block (fz_context *ctx, fz_stext_page *page, fz_stext_block *block)
fz_stext_page * fz_new_stext_page (fz_context *ctx, fz_rect mediabox)
void fz_drop_stext_page (fz_context *ctx, fz_stext_page *page)
void fz_print_stext_page_as_html (fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
void fz_print_stext_header_as_html (fz_context *ctx, fz_output *out)
void fz_print_stext_trailer_as_html (fz_context *ctx, fz_output *out)
void fz_print_stext_page_as_xhtml (fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
void fz_print_stext_header_as_xhtml (fz_context *ctx, fz_output *out)
void fz_print_stext_trailer_as_xhtml (fz_context *ctx, fz_output *out)
void fz_print_stext_page_as_xml (fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
void fz_print_stext_page_as_xml_with_flags (fz_context *ctx, fz_output *out, fz_stext_page *page, int id, fz_stext_xml_flags flags)
void fz_debug_stext_page (fz_context *ctx, fz_stext_page *page, int id)
void fz_print_stext_page_as_json (fz_context *ctx, fz_output *out, fz_stext_page *page, float scale)
void fz_print_stext_page_as_text (fz_context *ctx, fz_output *out, fz_stext_page *page)
int fz_search_stext_page (fz_context *ctx, fz_stext_page *text, const char *needle, int *hit_mark, fz_quad *hit_bbox, int hit_max)
int fz_search_stext_page_cb (fz_context *ctx, fz_stext_page *text, const char *needle, fz_search_callback_fn *cb, void *opaque)
int fz_highlight_selection (fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, fz_quad *quads, int max_quads)
fz_quad fz_snap_selection (fz_context *ctx, fz_stext_page *page, fz_point *ap, fz_point *bp, int mode)
char * fz_copy_selection (fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, int crlf)
char * fz_copy_rectangle (fz_context *ctx, fz_stext_page *page, fz_rect area, int crlf)
void fz_init_stext_options (fz_context *ctx, fz_stext_options *opts)
fz_stext_options * fz_parse_stext_options (fz_context *ctx, fz_stext_options *opts, const char *string)
void fz_apply_stext_options (fz_context *ctx, fz_stext_options *opts, fz_options *options)
int fz_segment_stext_page (fz_context *ctx, fz_stext_page *page)
int fz_segment_stext_rect (fz_context *ctx, fz_stext_page *page, fz_rect rect)
void fz_paragraph_break (fz_context *ctx, fz_stext_page *page)
void fz_table_hunt (fz_context *ctx, fz_stext_page *page)
void fz_table_hunt_within_bounds (fz_context *ctx, fz_stext_page *page, fz_rect bounds)
fz_stext_block * fz_find_table_within_bounds (fz_context *ctx, fz_stext_page *page, fz_rect bounds)
fz_stext_block * fz_find_table_within_grid (fz_context *ctx, fz_stext_page *page, fz_stext_grid_positions *xpos, fz_stext_grid_positions *ypos, float limit)
int fz_propose_table_within_bounds (fz_context *ctx, fz_stext_page *page, fz_rect bounds, fz_stext_grid_positions **xposp, fz_stext_grid_positions **yposp)
fz_device * fz_new_stext_device (fz_context *ctx, fz_stext_page *page, const fz_stext_options *options)
fz_device * fz_new_stext_device_for_page (fz_context *ctx, fz_stext_page *stext_page, const fz_stext_options *opts, int chapter_num, int page_num, fz_rect mediabox)
fz_device * fz_new_ocr_device (fz_context *ctx, fz_device *target, fz_matrix ctm, fz_rect mediabox, int with_list, const char *language, const char *datadir, int(*progress)(fz_context *, void *, int), void *progress_arg)
fz_device * fz_new_ocr_device_with_options (fz_context *ctx, fz_device *target, fz_matrix ctm, fz_rect mediabox, int with_list, const char *language, const char *datadir, int(*progress)(fz_context *, void *, int), void *progress_arg, fz_options *options)
fz_document * fz_open_reflowed_document (fz_context *ctx, fz_document *underdoc, const fz_stext_options *opts)
int fz_is_unicode_space_equivalent (int c)
int fz_is_unicode_whitespace (int c)
int fz_is_unicode_hyphen (int c)
void fz_init_search_options (fz_context *ctx, fz_search_options *options)
fz_search_options * fz_parse_search_options (fz_context *ctx, fz_search_options *options, const char *args)
void fz_apply_search_options (fz_context *ctx, fz_search_options *options, fz_options *opts)
fz_search * fz_new_search (fz_context *ctx, const char *needle, fz_search_options options)
fz_search_result fz_search_forwards (fz_context *ctx, fz_search *search)
fz_search_result fz_search_backwards (fz_context *ctx, fz_search *search)
void fz_feed_search (fz_context *ctx, fz_search *search, fz_stext_page *page, int seq)
void fz_drop_search (fz_context *ctx, fz_search *search)
int fz_match_stext_page (fz_context *ctx, fz_stext_page *text, const char *needle, int *hit_mark, fz_quad *hit_bbox, int hit_max, fz_search_options options)
int fz_match_stext_page_cb (fz_context *ctx, fz_stext_page *page, const char *needle, fz_match_callback_fn *cb, void *opaque, fz_search_options options)
fz_stext_block * fz_new_stext_struct (fz_context *ctx, fz_stext_page *page, fz_structure standard, const char *raw, int index)
fz_stext_page_block_iterator fz_stext_page_block_iterator_begin (fz_stext_page *page)
fz_stext_page_block_iterator fz_stext_page_block_iterator_begin_from (fz_stext_page *page, fz_stext_block *block, fz_stext_struct *top)
fz_stext_page_block_iterator fz_stext_page_block_iterator_begin_dfs (fz_stext_page *page)
fz_stext_page_block_iterator fz_stext_page_block_iterator_begin_from_dfs (fz_stext_page *page, fz_stext_block *block, fz_stext_struct *top)
fz_stext_page_block_iterator fz_stext_page_block_iterator_begin_rdfs (fz_stext_page *page)
fz_stext_page_block_iterator fz_stext_page_block_iterator_begin_from_rdfs (fz_stext_page *page, fz_stext_block *block, fz_stext_struct *top)
fz_stext_page_block_iterator fz_stext_page_block_iterator_next (fz_stext_page_block_iterator pos)
fz_stext_page_block_iterator fz_stext_page_block_iterator_down (fz_stext_page_block_iterator pos)
fz_stext_page_block_iterator fz_stext_page_block_iterator_up (fz_stext_page_block_iterator pos)
fz_stext_page_block_iterator fz_stext_page_block_iterator_next_dfs (fz_stext_page_block_iterator pos)
fz_stext_page_block_iterator fz_stext_page_block_iterator_next_rdfs (fz_stext_page_block_iterator pos)
int fz_stext_page_block_iterator_eod (fz_stext_page_block_iterator pos)
int fz_stext_page_block_iterator_eod_dfs (fz_stext_page_block_iterator pos)
int fz_stext_page_block_iterator_eod_rdfs (fz_stext_page_block_iterator pos)
void fz_classify_stext_rect (fz_context *ctx, fz_stext_page *page, fz_structure classification, fz_rect rect)
int fz_stext_remove_page_fill (fz_context *ctx, fz_stext_page *page)
void fz_stext_raft_images (fz_context *ctx, fz_stext_page *stext, fz_image_raft_options *options)
fz_flotilla * fz_new_flotilla_from_stext_page_vectors (fz_context *ctx, fz_stext_page *page)
void fz_drop_flotilla (fz_context *ctx, fz_flotilla *f)
int fz_flotilla_size (fz_context *ctx, fz_flotilla *flot)
fz_rect fz_flotilla_raft_area (fz_context *ctx, fz_flotilla *flot, int i)
void fz_verify_stext_page (fz_context *ctx, fz_stext_page *page, const char *title)

Variables

FZ_DATA const char * fz_stext_options_usage
FZ_DATA const char * fz_search_options_usage

Typedef Documentation

◆ fz_flotilla

typedef struct fz_flotilla fz_flotilla

◆ fz_layout_char

typedef struct fz_layout_char fz_layout_char

Simple text layout (for use with annotation editing primarily).

◆ fz_layout_line

typedef struct fz_layout_line fz_layout_line

◆ fz_match_callback_fn

typedef int fz_match_callback_fn(fz_context *ctx, void *opaque, int num_quads, fz_quad *hit_bbox, int chapter, int page)

Callback function for use in searching.

Called with the list of quads that correspond to a single hit.

The callback should return with 0 to continue the search, or 1 to abort it. All other values are reserved at this point.

◆ fz_search

typedef struct fz_search fz_search

◆ fz_search_callback_fn

typedef int fz_search_callback_fn(fz_context *ctx, void *opaque, int num_quads, fz_quad *hit_bbox)

Callback function for use in searching.

Called with the list of quads that correspond to a single hit.

The callback should return with 0 to continue the search, or 1 to abort it. All other values are reserved at this point.

◆ fz_stext_block

typedef struct fz_stext_block fz_stext_block

◆ fz_stext_char

typedef struct fz_stext_char fz_stext_char

Text extraction device: Used for searching, format conversion etc.

(In development - Subject to change in future versions)

◆ fz_stext_grid_positions

typedef struct fz_stext_grid_positions fz_stext_grid_positions

◆ fz_stext_line

typedef struct fz_stext_line fz_stext_line

◆ fz_stext_struct

typedef struct fz_stext_struct fz_stext_struct

Enumeration Type Documentation

◆ anonymous enum

anonymous enum

FZ_STEXT_PRESERVE_LIGATURES: If this option is activated ligatures are passed through to the application in their original form. If this option is deactivated ligatures are expanded into their constituent parts, e.g. the ligature ffi is expanded into three separate characters f, f and i.

FZ_STEXT_PRESERVE_WHITESPACE: If this option is activated whitespace is passed through to the application in its original form. If this option is deactivated any type of horizontal whitespace (including horizontal tabs) will be replaced with space characters of variable width.

FZ_STEXT_PRESERVE_IMAGES: If this option is set, then images will be stored in the structured text structure. The default is to ignore all images.

FZ_STEXT_INHIBIT_SPACES: If this option is set, we will not try to add missing space characters where there are large gaps between characters.

FZ_STEXT_DEHYPHENATE: If this option is set, hyphens at the end of a line will be recorded as being soft-hyphens; when flattened, soft-hyphens at the end of lines will cause the lines to be joined.

FZ_STEXT_PRESERVE_SPANS: If this option is set, spans on the same line will not be merged. Each line will thus be a span of text with the same font, colour, and size.

FZ_STEXT_CLIP: If this option is set, characters that would be entirely clipped away by the current clipping path (or, more accurately, the smallest bbox that contains the current clipping path) will be ignored. The bboxes of images will be similarly reduced in size. The clip path is guaranteed to be smaller than the page mediabox, hence this option subsumes an older, now deprecated, FZ_STEXT_MEDIABOX_CLIP option.

FZ_STEXT_CLIP_RECT: If this option is set, characters that would be entirely clipped away by the specified 'clip' rectangle in the options struct will be ignored. This enables content from specific subsections of pages to be extracted.

FZ_STEXT_COLLECT_STRUCTURE: If this option is set, we will collect the structure as specified using begin/end_structure calls. This will change the returned stext structure from being a simple list of blocks into effectively being a 'tree' that should be walked in depth-first order.

FZ_STEXT_COLLECT_VECTORS: If this option is set, we will collect details (currently just the bbox) of vector graphics. This is intended to be of use in segmentation analysis.

FZ_STEXT_LAZY_VECTORS: If this option is set, we will defer collected vectors to the end of the text run they appear in. This prevents vector drawn strikeouts, or diacritics/accents/marks from breaking the flow of text.

FZ_STEXT_FUZZY_VECTORS: If this option is set, we 'fuzzily' collect rectangular vectors of the same colour together. This enables us to spot where 'pixels' or 'slices' of vectors are used to create the appearance of characters on the page without exploding the storage and processing time requirements.

FZ_STEXT_IGNORE_ACTUALTEXT: If this option is set, we will no longer replace text by the ActualText replacement specified in the document.

FZ_STEXT_SEGMENT: If this option is set, we will attempt to segment the page into different regions. This will deliberately not do anything to pages with structure information present.

FZ_STEXT_PARAGRAPH_BREAK: If this option is set, we will break blocks of text at what appear to be paragraph boundaries. This only works for left-to-right, top-to-bottom paragraphs. Works best on a segmented page.

FZ_STEXT_TABLE_HUNT: If this option is set, we will hunt for tables within the stext. Details of the potential tables found will be inserted into the stext for the caller to interpret. This will work best on a segmented page.

FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE: If this option is set, then in the event that we fail to find a unicode value for a given character, we will instead return its CID in the unicode field. We will set the FZ_STEXT_UNICODE_IS_CID bit in the char flags word to indicate that this has happened.

FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE: If this option is set, then in the event that we fail to find a unicode value for a given character, we will instead return its glyph in the unicode field. We will set the FZ_STEXT_UNICODE_IS_GID bit in the char flags word to indicate that this has happened.

Setting both FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE and FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE will give undefined behaviour.

Enumerator
FZ_STEXT_PRESERVE_LIGATURES 
FZ_STEXT_PRESERVE_WHITESPACE 
FZ_STEXT_PRESERVE_IMAGES 
FZ_STEXT_INHIBIT_SPACES 
FZ_STEXT_DEHYPHENATE 
FZ_STEXT_PRESERVE_SPANS 
FZ_STEXT_CLIP 
FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE 
FZ_STEXT_COLLECT_STRUCTURE 
FZ_STEXT_ACCURATE_BBOXES 
FZ_STEXT_COLLECT_VECTORS 
FZ_STEXT_IGNORE_ACTUALTEXT 
FZ_STEXT_SEGMENT 
FZ_STEXT_PARAGRAPH_BREAK 
FZ_STEXT_TABLE_HUNT 
FZ_STEXT_COLLECT_STYLES 
FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE 
FZ_STEXT_CLIP_RECT 
FZ_STEXT_ACCURATE_ASCENDERS 
FZ_STEXT_ACCURATE_SIDE_BEARINGS 
FZ_STEXT_LAZY_VECTORS 
FZ_STEXT_FUZZY_VECTORS 
FZ_STEXT_MEDIABOX_CLIP 

◆ anonymous enum

anonymous enum
Enumerator
FZ_STEXT_VECTOR_IS_STROKED 
FZ_STEXT_VECTOR_IS_RECTANGLE 
FZ_STEXT_VECTOR_CONTINUES 

◆ anonymous enum

anonymous enum
Enumerator
FZ_STEXT_BLOCK_TEXT 
FZ_STEXT_BLOCK_IMAGE 
FZ_STEXT_BLOCK_STRUCT 
FZ_STEXT_BLOCK_VECTOR 
FZ_STEXT_BLOCK_GRID 

◆ anonymous enum

anonymous enum
Enumerator
FZ_STEXT_GRID_H_CROSSED 
FZ_STEXT_GRID_V_CROSSED 
FZ_STEXT_GRID_L_BORDER 
FZ_STEXT_GRID_T_BORDER 
FZ_STEXT_GRID_FULL 

◆ anonymous enum

anonymous enum
Enumerator
FZ_STEXT_TEXT_JUSTIFY_UNKNOWN 
FZ_STEXT_TEXT_JUSTIFY_LEFT 
FZ_STEXT_TEXT_JUSTIFY_CENTER 
FZ_STEXT_TEXT_JUSTIFY_RIGHT 
FZ_STEXT_TEXT_JUSTIFY_FULL 

◆ anonymous enum

anonymous enum
Enumerator
FZ_STEXT_STRIKEOUT 
FZ_STEXT_UNDERLINE 
FZ_STEXT_SYNTHETIC 
FZ_STEXT_BOLD 
FZ_STEXT_FILLED 
FZ_STEXT_STROKED 
FZ_STEXT_CLIPPED 
FZ_STEXT_UNICODE_IS_CID 
FZ_STEXT_UNICODE_IS_GID 
FZ_STEXT_SYNTHETIC_LARGE 
FZ_STEXT_HIGHLIGHT 

◆ anonymous enum

anonymous enum
Enumerator
FZ_SELECT_CHARS 
FZ_SELECT_WORDS 
FZ_SELECT_LINES 

◆ fz_search_options

Enumerator
FZ_SEARCH_EXACT 
FZ_SEARCH_IGNORE_CASE 
FZ_SEARCH_IGNORE_DIACRITICS 
FZ_SEARCH_REGEXP 
FZ_SEARCH_KEEP_LINES 
FZ_SEARCH_KEEP_PARAGRAPHS 
FZ_SEARCH_KEEP_HYPHENS 

◆ fz_search_reason

Enumerator
FZ_SEARCH_MORE_INPUT 
FZ_SEARCH_MATCH 
FZ_SEARCH_COMPLETE 

◆ fz_stext_line_flags

Enumerator
FZ_STEXT_LINE_FLAGS_JOINED 

◆ fz_stext_xml_flags

Output structured text to a file in XML format, with flags to control how much of the structure is displayed.

Enumerator
FZ_STEXT_XML_FLAGS_CHARS 
FZ_STEXT_XML_FLAGS_POINTERS 

Function Documentation

◆ fz_add_layout_char()

void fz_add_layout_char ( fz_context * ctx,
fz_layout_block * block,
float x,
float w,
const char * p )

Add a new char to the line at the end of the layout block.

◆ fz_add_layout_line()

void fz_add_layout_line ( fz_context * ctx,
fz_layout_block * block,
float x,
float y,
float h,
const char * p )

Add a new line to the end of the layout block.

◆ fz_apply_search_options()

void fz_apply_search_options ( fz_context * ctx,
fz_search_options * options,
fz_options * opts )

◆ fz_apply_stext_options()

void fz_apply_stext_options ( fz_context * ctx,
fz_stext_options * opts,
fz_options * options )

Parse stext device options from an fz_options struct into an already initialised opts structure.

◆ fz_classify_stext_rect()

void fz_classify_stext_rect ( fz_context * ctx,
fz_stext_page * page,
fz_structure classification,
fz_rect rect )

◆ fz_copy_rectangle()

char * fz_copy_rectangle ( fz_context * ctx,
fz_stext_page * page,
fz_rect area,
int crlf )

Return a newly allocated UTF-8 string with the text for a given selection rectangle.

crlf: If true, write "\r\n" style line endings (otherwise "\n" only).

◆ fz_copy_selection()

char * fz_copy_selection ( fz_context * ctx,
fz_stext_page * page,
fz_point a,
fz_point b,
int crlf )

Return a newly allocated UTF-8 string with the text for a given selection.

crlf: If true, write "\r\n" style line endings (otherwise "\n" only).

◆ fz_debug_stext_page()

void fz_debug_stext_page ( fz_context * ctx,
fz_stext_page * page,
int id )

Convenience function to call the above.

◆ fz_drop_flotilla()

void fz_drop_flotilla ( fz_context * ctx,
fz_flotilla * f )

◆ fz_drop_layout()

void fz_drop_layout ( fz_context * ctx,
fz_layout_block * block )

Drop layout block. Free the pool, and linked blocks.

Never throws exceptions.

◆ fz_drop_search()

void fz_drop_search ( fz_context * ctx,
fz_search * search )

Free the search structures.

◆ fz_drop_stext_page()

void fz_drop_stext_page ( fz_context * ctx,
fz_stext_page * page )

◆ fz_feed_search()

void fz_feed_search ( fz_context * ctx,
fz_search * search,
fz_stext_page * page,
int seq )

Supply more stext to be searched; ownership of the stext page is passed in.

This can be called immediately after an fz_search has been created to give it the first page to search, or it will be requested as soon as the first search operation is done on that page.

If we are calling this in response to fz_search_forwards telling us that we need another page, page will be the stext for the next page.

If we are calling this in response to fz_search_backwards telling us that we need another page, page will be the stext for the previous page.

seq is a simple integer value that will be parroted back to us in the match (typically the page number within the document).

The search function will retain the page for a while. When it has finished with it, it will call fz_drop_stext_page() to release it.

Pass page = NULL to indicate that there are no more pages (in this direction) to be fed.

◆ fz_find_table_within_bounds()

fz_stext_block * fz_find_table_within_bounds ( fz_context * ctx,
fz_stext_page * page,
fz_rect bounds )

Interpret the bounded contents of a given stext page as a table.

The page contents will be rewritten to contain a Table structure with the identified content in it.

This uses the same logic as for fz_table_hunt, without the actual hunting. fz_table_hunt hunts to find possible bounds for multiple tables on the page; this routine just finds a single table contained within the given rectangle.

Returns the stext_block list that contains the content of the table.

◆ fz_find_table_within_grid()

fz_stext_block * fz_find_table_within_grid ( fz_context * ctx,
fz_stext_page * page,
fz_stext_grid_positions * xpos,
fz_stext_grid_positions * ypos,
float limit )

Interpret the contents of a given stext page that fall within a given grid as a table.

The page contents will be rewritten to contain a Table structure with the identified content in it.

This uses the same logic as for fz_table_hunt, without the actual hunting, and the grid detection phase. fz_table_hunt hunts to find possible bounds for multiple tables on the page; this routine just finds a single table contained within the given rectangle. The grid detection phase is skipped, and we just use the grid as given to us. We still perform the cell analysis stage though, so the grid can be refined.

Returns the stext_block list that contains the content of the table, or NULL if no table is found that scores below limit.

◆ fz_flotilla_raft_area()

fz_rect fz_flotilla_raft_area ( fz_context * ctx,
fz_flotilla * flot,
int i )

◆ fz_flotilla_size()

int fz_flotilla_size ( fz_context * ctx,
fz_flotilla * flot )

◆ fz_highlight_selection()

int fz_highlight_selection ( fz_context * ctx,
fz_stext_page * page,
fz_point a,
fz_point b,
fz_quad * quads,
int max_quads )

Return a list of quads to highlight lines inside the selection points.

◆ fz_init_search_options()

void fz_init_search_options ( fz_context * ctx,
fz_search_options * options )

◆ fz_init_stext_options()

void fz_init_stext_options ( fz_context * ctx,
fz_stext_options * opts )

◆ fz_is_unicode_hyphen()

int fz_is_unicode_hyphen ( int c)

Simple function to return if a given unicode char is a hyphen.

◆ fz_is_unicode_space_equivalent()

int fz_is_unicode_space_equivalent ( int c)

Simple function to return if a given unicode char is equivalent to a space.

◆ fz_is_unicode_whitespace()

int fz_is_unicode_whitespace ( int c)

Simple function to return if a given unicode char is whitespace.

◆ fz_keep_stext_page()

fz_stext_page * fz_keep_stext_page ( fz_context * ctx,
fz_stext_page * page )

Take a new reference to an fz_stext_page.

◆ fz_match_stext_page()

int fz_match_stext_page ( fz_context * ctx,
fz_stext_page * text,
const char * needle,
int * hit_mark,
fz_quad * hit_bbox,
int hit_max,
fz_search_options options )

Search for occurrence of 'needle' in text page, matching in a given style.

Return the number of quads and store hit quads in the passed in array.

NOTE: This is an experimental interface and subject to change without notice.

◆ fz_match_stext_page_cb()

int fz_match_stext_page_cb ( fz_context * ctx,
fz_stext_page * page,
const char * needle,
fz_match_callback_fn * cb,
void * opaque,
fz_search_options options )

Search for occurrence of 'needle' in text page.

Call callback once for each hit. This callback will receive (potentially) multiple quads for each hit.

Returns the number of hits - note that this is potentially different from (i.e. is not greater than) the number of quads as returned by the non-callback API.

NOTE: This is an experimental interface and subject to change without notice.

◆ fz_new_flotilla_from_stext_page_vectors()

fz_flotilla * fz_new_flotilla_from_stext_page_vectors ( fz_context * ctx,
fz_stext_page * page )

◆ fz_new_layout()

fz_layout_block * fz_new_layout ( fz_context * ctx)

Create a new layout block, with new allocation pool, zero matrices, and initialise linked pointers.

◆ fz_new_ocr_device()

fz_device * fz_new_ocr_device ( fz_context * ctx,
fz_device * target,
fz_matrix ctm,
fz_rect mediabox,
int with_list,
const char * language,
const char * datadir,
int(* progress )(fz_context *, void *, int),
void * progress_arg )

Create a device to OCR the text on the page.

Renders the page internally to a bitmap that is then OCRd. Text is then forwarded onto the target device.

target: The target device to receive the OCRd text.

ctm: The transform to apply to the mediabox to get the size for the rendered page image. Also used to calculate the resolution for the page image. In general, this will be the same as the CTM that you pass to fz_run_page (or fz_run_display_list) to feed this device.

mediabox: The mediabox (in points). Combined with the CTM to get the bounds of the pixmap used internally for the rendered page image.

with_list: If with_list is false, then all non-text operations are forwarded instantly to the target device. This results in the target device seeing all NON-text operations, followed by all the text operations (derived from OCR).

If with_list is true, then all the marking operations are collated into a display list which is then replayed to the target device at the end.

language: NULL (for "eng"), or a pointer to a string to describe the languages/scripts that should be used for OCR (e.g. "eng,ara").

datadir: NULL (for ""), or a pointer to a path string otherwise provided to Tesseract in the TESSDATA_PREFIX environment variable.

progress: NULL, or function to be called periodically to indicate progress. Return 0 to continue, or 1 to cancel. progress_arg is returned as the void *. The int is a value between 0 and 100 to indicate progress.

progress_arg: A void * value to be parroted back to the progress function.

◆ fz_new_ocr_device_with_options()

fz_device * fz_new_ocr_device_with_options ( fz_context * ctx,
fz_device * target,
fz_matrix ctm,
fz_rect mediabox,
int with_list,
const char * language,
const char * datadir,
int(* progress )(fz_context *, void *, int),
void * progress_arg,
fz_options * options )

◆ fz_new_search()

fz_search * fz_new_search ( fz_context * ctx,
const char * needle,
fz_search_options options )

Create a new search.

If the needle is invalid (in the case of regexps, it fails to compile) it will throw an error.

◆ fz_new_stext_device()

fz_device * fz_new_stext_device ( fz_context * ctx,
fz_stext_page * page,
const fz_stext_options * options )

Create a device to extract the text on a page.

Gather the text on a page into blocks and lines.

The reading order is taken from the order the text is drawn in the source file, so may not be accurate.

page: The text page to which content should be added. This will usually be a newly created (empty) text page, but it can be one containing data already (for example when merging multiple pages, or watermarking).

options: Options to configure the stext device.

◆ fz_new_stext_device_for_page()

fz_device * fz_new_stext_device_for_page ( fz_context * ctx,
fz_stext_page * stext_page,
const fz_stext_options * opts,
int chapter_num,
int page_num,
fz_rect mediabox )

Create a device to extract the text on a page into an existing fz_stext_page structure.

Gather the text on a page into blocks and lines.

The reading order is taken from the order the text is drawn in the source file, so may not be accurate.

stext_page: The text page to which content should be added. This will usually be a newly created (empty) text page, but it can be one containing data already (for example when merging multiple pages, or watermarking).

opts: Options to configure the stext device.

The next 2 parameters are copied into the fz_stext_page structure's ids section, so only have to be valid if you expect to interrogate that section later.

chapter_num: The chapter number that this page came from.

page_num: The page number that this page came from.

The final parameter is copied into the fz_stext_page structure's ids section. The mediabox for the entire fz_stext_page is unioned with this, so pass fz_empty_bbox if you don't care about getting a valid value back from the ids section, but you don't want to upset the value in the page->mediabox field.

mediabox: The mediabox for this page.

◆ fz_new_stext_page()

fz_stext_page * fz_new_stext_page ( fz_context * ctx,
fz_rect mediabox )

Create an empty text page.

The text page is filled out by the text device to contain the blocks and lines of text on the page.

mediabox: optional mediabox information.

◆ fz_new_stext_struct()

fz_stext_block * fz_new_stext_struct ( fz_context * ctx,
fz_stext_page * page,
fz_structure standard,
const char * raw,
int index )

◆ fz_open_reflowed_document()

fz_document * fz_open_reflowed_document ( fz_context * ctx,
fz_document * underdoc,
const fz_stext_options * opts )

◆ fz_paragraph_break()

void fz_paragraph_break ( fz_context * ctx,
fz_stext_page * page )

Attempt to break paragraphs at plausible places.

◆ fz_parse_search_options()

fz_search_options * fz_parse_search_options ( fz_context * ctx,
fz_search_options * options,
const char * args )

◆ fz_parse_stext_options()

fz_stext_options * fz_parse_stext_options ( fz_context * ctx,
fz_stext_options * opts,
const char * string )

Parse stext device options from a comma separated key-value string.

This initialises the opts structure.

◆ fz_print_stext_header_as_html()

void fz_print_stext_header_as_html ( fz_context * ctx,
fz_output * out )

◆ fz_print_stext_header_as_xhtml()

void fz_print_stext_header_as_xhtml ( fz_context * ctx,
fz_output * out )

◆ fz_print_stext_page_as_html()

void fz_print_stext_page_as_html ( fz_context * ctx,
fz_output * out,
fz_stext_page * page,
int id )

Output structured text to a file in HTML (visual) format.

◆ fz_print_stext_page_as_json()

void fz_print_stext_page_as_json ( fz_context * ctx,
fz_output * out,
fz_stext_page * page,
float scale )

Output structured text to a file in JSON format.

◆ fz_print_stext_page_as_text()

void fz_print_stext_page_as_text ( fz_context * ctx,
fz_output * out,
fz_stext_page * page )

Output structured text to a file in plain-text UTF-8 format.

◆ fz_print_stext_page_as_xhtml()

void fz_print_stext_page_as_xhtml ( fz_context * ctx,
fz_output * out,
fz_stext_page * page,
int id )

Output structured text to a file in XHTML (semantic) format.

◆ fz_print_stext_page_as_xml()

void fz_print_stext_page_as_xml ( fz_context * ctx,
fz_output * out,
fz_stext_page * page,
int id )

Output structured text to a file in XML format.

◆ fz_print_stext_page_as_xml_with_flags()

void fz_print_stext_page_as_xml_with_flags ( fz_context * ctx,
fz_output * out,
fz_stext_page * page,
int id,
fz_stext_xml_flags flags )

◆ fz_print_stext_trailer_as_html()

void fz_print_stext_trailer_as_html ( fz_context * ctx,
fz_output * out )

◆ fz_print_stext_trailer_as_xhtml()

void fz_print_stext_trailer_as_xhtml ( fz_context * ctx,
fz_output * out )

◆ fz_propose_table_within_bounds()

int fz_propose_table_within_bounds ( fz_context * ctx,
fz_stext_page * page,
fz_rect bounds,
fz_stext_grid_positions ** xposp,
fz_stext_grid_positions ** yposp )

Try to guess at the table structure within given bounds.

If no table can be found, we return 0. If we find one we return non-zero. (Currently, 1, other values reserved for the future.)

In the case of a non-zero return, xposp and yposp are returned as pointers to fz_stext_grid_positions records that must be freed.

◆ fz_search_backwards()

fz_search_result fz_search_backwards ( fz_context * ctx,
fz_search * search )

Continue searching backwards for the next match.

Will return asking for more stext, having matched, or having completed the search.

If it asks for more stext, then any further calls to this function will give the same result, until stext is supplied, or a NULL stext is fed in to indicate the end of the document.

Several pages may be requested before searching begins.

◆ fz_search_forwards()

fz_search_result fz_search_forwards ( fz_context * ctx,
fz_search * search )

Continue searching for the next match.

Will return with a search result.

If it asks for more stext, feed it with the requested page (or NULL to tell it it's the end of the document) before calling this again.

Several pages may be requested before searching begins.

◆ fz_search_stext_page()

int fz_search_stext_page ( fz_context * ctx,
fz_stext_page * text,
const char * needle,
int * hit_mark,
fz_quad * hit_bbox,
int hit_max )

Search for occurrence of 'needle' in text page. Case insensitive match.

Return the number of quads and store hit quads in the passed in array.

NOTE: This is an experimental interface and subject to change without notice.

◆ fz_search_stext_page_cb()

int fz_search_stext_page_cb ( fz_context * ctx,
fz_stext_page * text,
const char * needle,
fz_search_callback_fn * cb,
void * opaque )

Search for occurrence of 'needle' in text page.

Call callback once for each hit. This callback will receive (potentially) multiple quads for each hit.

Returns the number of hits - note that this is potentially different from (i.e. is not greater than) the number of quads as returned by the non-callback API.

NOTE: This is an experimental interface and subject to change without notice.

◆ fz_segment_stext_page()

int fz_segment_stext_page ( fz_context * ctx,
fz_stext_page * page )

Perform segmentation analysis on an (unstructured) page to look for recursive subdivisions.

Essentially this code attempts to split the page horizontally and/or vertically repeatedly into smaller and smaller "segments" (divisions).

This minimises the reordering of the content, but some reordering may be unavoidable.

Returns 0 if no changes were made to the document.

This is experimental code, and may change (or be removed) in future versions!

◆ fz_segment_stext_rect()

int fz_segment_stext_rect ( fz_context * ctx,
fz_stext_page * page,
fz_rect rect )

Perform segmentation analysis on a rectangle of a given stext page.

Like fz_segment_stext_page, this attempts to split the given page region horizontally and/or vertically repeatedly into smaller and smaller "segments".

This works for pages with structure too, but splitting with rectangles that cut across structure blocks may not behave as expected.

This minimises the reordering of the content (as viewed from the perspective of a depth first traversal), but some reordering may be unavoidable.

This function accepts smaller gaps for segmentation than the full page segmentation does.

Returns 0 if no changes were made to the document.

This is experimental code, and may change (or be removed) in future versions!

◆ fz_snap_selection()

fz_quad fz_snap_selection ( fz_context * ctx,
fz_stext_page * page,
fz_point * ap,
fz_point * bp,
int mode )

◆ fz_stext_page_block_iterator_begin()

fz_stext_page_block_iterator fz_stext_page_block_iterator_begin ( fz_stext_page * page)

◆ fz_stext_page_block_iterator_begin_dfs()

fz_stext_page_block_iterator fz_stext_page_block_iterator_begin_dfs ( fz_stext_page * page)

◆ fz_stext_page_block_iterator_begin_from()

fz_stext_page_block_iterator fz_stext_page_block_iterator_begin_from ( fz_stext_page * page,
fz_stext_block * block,
fz_stext_struct * top )

◆ fz_stext_page_block_iterator_begin_from_dfs()

fz_stext_page_block_iterator fz_stext_page_block_iterator_begin_from_dfs ( fz_stext_page * page,
fz_stext_block * block,
fz_stext_struct * top )

◆ fz_stext_page_block_iterator_begin_from_rdfs()

fz_stext_page_block_iterator fz_stext_page_block_iterator_begin_from_rdfs ( fz_stext_page * page,
fz_stext_block * block,
fz_stext_struct * top )

◆ fz_stext_page_block_iterator_begin_rdfs()

fz_stext_page_block_iterator fz_stext_page_block_iterator_begin_rdfs ( fz_stext_page * page)

◆ fz_stext_page_block_iterator_down()

fz_stext_page_block_iterator fz_stext_page_block_iterator_down ( fz_stext_page_block_iterator pos)

◆ fz_stext_page_block_iterator_eod()

int fz_stext_page_block_iterator_eod ( fz_stext_page_block_iterator pos)

◆ fz_stext_page_block_iterator_eod_dfs()

int fz_stext_page_block_iterator_eod_dfs ( fz_stext_page_block_iterator pos)

◆ fz_stext_page_block_iterator_eod_rdfs()

int fz_stext_page_block_iterator_eod_rdfs ( fz_stext_page_block_iterator pos)

◆ fz_stext_page_block_iterator_next()

fz_stext_page_block_iterator fz_stext_page_block_iterator_next ( fz_stext_page_block_iterator pos)

◆ fz_stext_page_block_iterator_next_dfs()

fz_stext_page_block_iterator fz_stext_page_block_iterator_next_dfs ( fz_stext_page_block_iterator pos)

◆ fz_stext_page_block_iterator_next_rdfs()

fz_stext_page_block_iterator fz_stext_page_block_iterator_next_rdfs ( fz_stext_page_block_iterator pos)

◆ fz_stext_page_block_iterator_up()

fz_stext_page_block_iterator fz_stext_page_block_iterator_up ( fz_stext_page_block_iterator pos)

◆ fz_stext_page_details_for_block()

fz_stext_page_details * fz_stext_page_details_for_block ( fz_context * ctx,
fz_stext_page * page,
fz_stext_block * block )

Helper function to retrieve the details for a given id from a block.

◆ fz_stext_raft_images()

void fz_stext_raft_images ( fz_context * ctx,
fz_stext_page * stext,
fz_image_raft_options * options )

◆ fz_stext_remove_page_fill()

int fz_stext_remove_page_fill ( fz_context * ctx,
fz_stext_page * page )

◆ fz_table_hunt()

void fz_table_hunt ( fz_context * ctx,
fz_stext_page * page )

Hunt for possible tables on a page, and update the stext with information.

◆ fz_table_hunt_within_bounds()

void fz_table_hunt_within_bounds ( fz_context * ctx,
fz_stext_page * page,
fz_rect bounds )

Hunt for possible tables within a specific rect on a page, and update the stext with information.

◆ fz_verify_stext_page()

void fz_verify_stext_page ( fz_context * ctx,
fz_stext_page * page,
const char * title )

Variable Documentation

◆ fz_search_options_usage

FZ_DATA const char* fz_search_options_usage
extern

◆ fz_stext_options_usage

FZ_DATA const char* fz_stext_options_usage
extern