VTK
dox/TextAnalysis/vtkTokenizer.h
Go to the documentation of this file.
00001 /*=========================================================================
00002 
00003   Program:   Visualization Toolkit
00004   Module:    vtkTokenizer.h
00005 
00006   Copyright (c) Ken Martin, Will Schroeder, Bill Lorensen
00007   All rights reserved.
00008   See Copyright.txt or http://www.kitware.com/Copyright.htm for details.
00009 
00010      This software is distributed WITHOUT ANY WARRANTY; without even
00011      the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
00012      PURPOSE.  See the above copyright notice for more information.
00013 
00014 =========================================================================*/
00015 /*-------------------------------------------------------------------------
00016   Copyright 2008 Sandia Corporation.
00017   Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00018   the U.S. Government retains certain rights in this software.
00019 -------------------------------------------------------------------------*/
00020 
00081 #ifndef __vtkTokenizer_h
00082 #define __vtkTokenizer_h
00083 
00084 #include <vtkTableAlgorithm.h>
00085 #include <vtkUnicodeString.h> //Needed for delimiter specification
00086 
// vtkTokenizer: a vtkTableAlgorithm subclass whose public interface configures
// two sets of Unicode delimiter ranges, called "dropped" and "kept".
// NOTE(review): the names suggest dropped delimiters split the input and are
// discarded while kept delimiters are emitted as tokens themselves -- the
// implementation is not visible here, confirm before relying on it.
00087 class VTK_TEXT_ANALYSIS_EXPORT vtkTokenizer :
00088   public vtkTableAlgorithm
00089 {
00090 public:
        // Static factory returning a vtkTokenizer pointer, plus the type macro
        // that names vtkTableAlgorithm as the superclass.
00091   static vtkTokenizer* New();
00092   vtkTypeMacro(vtkTokenizer, vtkTableAlgorithm);
        // Takes an output stream and an indent; presumably writes this object's
        // state to os -- confirm against the implementation.
00093   void PrintSelf(ostream& os, vtkIndent indent);
00094 
00095 //BTX
00097 
        // Description:
        // Defines storage for a single range of Unicode characters as a pair of
        // code points. NOTE(review): presumably half-open [first, second), to
        // match the DelimiterRanges description below -- confirm.
00099   typedef vtkstd::pair<vtkUnicodeString::value_type, vtkUnicodeString::value_type> DelimiterRange;
00100   // Description:
00101   // Defines storage for a collection of half-open ranges of Unicode characters.
00102   typedef vtkstd::vector<DelimiterRange> DelimiterRanges;
00104 
00106 
        // Description:
        // Returns a set of delimiter ranges. NOTE(review): by analogy with
        // Whitespace() below, presumably the ranges match Unicode punctuation
        // codepoints -- confirm.
00108   static const DelimiterRanges Punctuation();
00109   // Description:
00110   // Returns a set of delimiter ranges that match Unicode whitespace codepoints.
00111   static const DelimiterRanges Whitespace();
00112   // Description:
00113   // Returns a set of delimiter ranges that match logosyllabic languages where characters represent
00114   // words instead of sounds, such as Chinese, Japanese, and Korean.
00115   static const DelimiterRanges Logosyllabic();
00117 
00119 
        // Description:
        // Single-range counterpart of the collection overload below: takes one
        // range given by begin and end. NOTE(review): presumably the half-open
        // range [begin, end) is added to the "dropped" delimiters -- confirm.
00121   void AddDroppedDelimiters(vtkUnicodeString::value_type begin, vtkUnicodeString::value_type end);
00122   // Description:
00123   // Adds a collection of delimiter ranges to the set of "dropped" delimiters.
00124   void AddDroppedDelimiters(const DelimiterRanges& ranges);
00126 
00128 
        // Description:
        // Single-range counterpart of the collection overload below: takes one
        // range given by begin and end. NOTE(review): presumably the half-open
        // range [begin, end) is added to the "kept" delimiters -- confirm.
00130   void AddKeptDelimiters(vtkUnicodeString::value_type begin, vtkUnicodeString::value_type end);
00131   // Description:
00132   // Adds a collection of delimiter ranges to the set of "kept" delimiters.
00133   void AddKeptDelimiters(const DelimiterRanges& ranges);
00134 //ETX
00136 
00138 
        // Description:
        // Argument-free convenience methods. NOTE(review): by their names, each
        // presumably adds the matching predefined range set (Punctuation(),
        // Whitespace(), Logosyllabic()) to the "dropped" or "kept" delimiters --
        // confirm against the implementation.
00142   void DropPunctuation();
00143   void DropWhitespace();
00144   void KeepPunctuation();
00145   void KeepWhitespace();
00146   void KeepLogosyllabic();
00148 
00150 
        // Description:
        // Counterpart of ClearKeptDelimiters() for the "dropped" set.
        // NOTE(review): presumably clears the set of "dropped" delimiters -- confirm.
00151   void ClearDroppedDelimiters();
00152   // Description:
00153   // Clears the set of "kept" delimiters.
00154   void ClearKeptDelimiters();
00156 
00157 //BTX
00158 protected:
        // Construction and destruction are protected; New() above is the public
        // way to obtain an instance.
00159   vtkTokenizer();
00160   ~vtkTokenizer();
00161 
        // NOTE(review): presumably overrides a superclass hook that describes
        // the input port identified by `port` via `info` -- confirm.
00162   int FillInputPortInformation(int port, vtkInformation* info);
00163 
        // Virtual entry point taking the request plus input/output information
        // vectors. NOTE(review): presumably where the tokenization itself is
        // performed; the body is not visible here.
00164   virtual int RequestData(
00165     vtkInformation* request,
00166     vtkInformationVector** inputVector,
00167     vtkInformationVector* outputVector);
00168 
00169 private:
        // Copy construction and assignment are declared private and left
        // unimplemented, so instances cannot be copied.
00170   vtkTokenizer(const vtkTokenizer &); // Not implemented.
00171   void operator=(const vtkTokenizer &); // Not implemented.
00172 
        // Forward-declared nested class; its definition is not in this header.
        // The pointer itself is const, so it must be set in the constructor's
        // initializer list and cannot be reseated afterwards.
00173   class Internals;
00174   Internals* const Implementation;
00175 //ETX
00176 };
00177 
00178 #endif // __vtkTokenizer_h
00179