View Javadoc
1   /**
2    * This file Copyright (c) 2015 Magnolia International
3    * Ltd.  (http://www.magnolia-cms.com). All rights reserved.
4    *
5    *
6    * This file is dual-licensed under both the Magnolia
7    * Network Agreement and the GNU General Public License.
8    * You may elect to use one or the other of these licenses.
9    *
10   * This file is distributed in the hope that it will be
11   * useful, but AS-IS and WITHOUT ANY WARRANTY; without even the
12   * implied warranty of MERCHANTABILITY or FITNESS FOR A
13   * PARTICULAR PURPOSE, TITLE, or NONINFRINGEMENT.
14   * Redistribution, except as permitted by whichever of the GPL
15   * or MNA you select, is prohibited.
16   *
17   * 1. For the GPL license (GPL), you can redistribute and/or
18   * modify this file under the terms of the GNU General
19   * Public License, Version 3, as published by the Free Software
20   * Foundation.  You should have received a copy of the GNU
21   * General Public License, Version 3 along with this program;
22   * if not, write to the Free Software Foundation, Inc., 51
23   * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
24   *
25   * 2. For the Magnolia Network Agreement (MNA), this file
26   * and the accompanying materials are made available under the
27   * terms of the MNA which accompanies this distribution, and
28   * is available at http://www.magnolia-cms.com/mna.html
29   *
30   * Any modifications to this file must keep this entire header
31   * intact.
32   *
33   */
34  package info.magnolia.jackrabbit.lucene;
35  
import java.io.IOException;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.jackrabbit.core.query.lucene.AbstractExcerpt;
import org.apache.jackrabbit.core.query.lucene.DefaultHighlighter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositionVector;
43  
44  /**
45   * Provides an HTML excerpt highlighting the searched term. By default it will strip all HTML tags and jcr identifiers which might have been indexed.
46   * Suppose you were searching for "foo" and the text found containing it is something like
47   *
48   * <pre>
49   * &lt;p&gt;This is an "excerpt" &lt;em&gt;highlighting&lt;/em&gt; the word foo deadbeef-face-babe-cafe-babecafebabe...
50   * </pre>
51   *
52   * it produces a result like the following which can then be used e.g. in a search result page.
53   *
54   * <pre>
55   * &lt;div class="excerpt"&gt;
56   *  &lt;span class="excerpt-fragment"&gt;This is an excerpt highlighting the word &lt;strong&gt;foo&lt;/strong&gt;...&lt;/span&gt;
57   * &lt;div&gt;
58   *
59   * </pre>
60   */
61  public class SearchHTMLExcerpt extends AbstractExcerpt {
62  
63      private static final String UUID_REGEX = "(jcr:)?[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}";
64  
65      private final NoXMLEscapeHighlighter highlighter = new NoXMLEscapeHighlighter();
66  
67      @Override
68      protected String createExcerpt(TermPositionVector tpv, String text, int maxFragments, int maxFragmentSize) throws IOException {
69  
70          String excerpt = highlighter.doHighlight(tpv, getQueryTerms(), text,
71                  "<div class=\"excerpt\">", "</div>", "<span class=\"excerpt-fragment\">", "</span>", "<strong>", "</strong>",
72                  maxFragments, maxFragmentSize / 2);
73  
74          // need to strip ids after the highlighting else the latter, for some reason, doesn't match query terms correctly.
75          return excerpt.replaceAll(UUID_REGEX, "");
76      }
77  
78      /**
79       * Unlike {@link DefaultHighlighter} this one does not XML escape its string input.
80       * The XML escaping done by DefaultHighlighter escapes HTML tags and quotes (") with HTML entities, e.g &lt;p&gt; becomes &amplt;p&ampgt;
81       * which is ugly to see e.g. in a search result page.
82       */
83      private class NoXMLEscapeHighlighter extends DefaultHighlighter {
84          /**
85           * @return raw input
86           */
87          @Override
88          public String escape(String input) {
89              return input;
90          }
91  
92          /**
93           * We need to override this and call it explicitly as otherwise our {@link #escape(String)} method would never be called,
94           * see {@link DefaultHighlighter#highlight(TermPositionVector, Set, String, String, String, String, String, String, String, int, int)} line #109.
95           */
96          @Override
97          protected String doHighlight(TermPositionVector tvec, Set<Term[]> queryTerms, String text, String excerptStart, String excerptEnd, String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments, int surround) throws IOException {
98              return super.doHighlight(tvec, queryTerms, text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, hlStart, hlEnd, maxFragments, surround);
99          }
100     }
101 }