View Javadoc
1   /**
2    * This file Copyright (c) 2012-2018 Magnolia International
3    * Ltd.  (http://www.magnolia-cms.com). All rights reserved.
4    *
5    *
6    * This file is dual-licensed under both the Magnolia
7    * Network Agreement and the GNU General Public License.
8    * You may elect to use one or the other of these licenses.
9    *
10   * This file is distributed in the hope that it will be
11   * useful, but AS-IS and WITHOUT ANY WARRANTY; without even the
12   * implied warranty of MERCHANTABILITY or FITNESS FOR A
13   * PARTICULAR PURPOSE, TITLE, or NONINFRINGEMENT.
14   * Redistribution, except as permitted by whichever of the GPL
15   * or MNA you select, is prohibited.
16   *
17   * 1. For the GPL license (GPL), you can redistribute and/or
18   * modify this file under the terms of the GNU General
19   * Public License, Version 3, as published by the Free Software
20   * Foundation.  You should have received a copy of the GNU
21   * General Public License, Version 3 along with this program;
22   * if not, write to the Free Software Foundation, Inc., 51
23   * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
24   *
25   * 2. For the Magnolia Network Agreement (MNA), this file
26   * and the accompanying materials are made available under the
27   * terms of the MNA which accompanies this distribution, and
28   * is available at http://www.magnolia-cms.com/mna.html
29   *
30   * Any modifications to this file must keep this entire header
31   * intact.
32   *
33   */
34  package info.magnolia.templating.jsp.taglib;
35  
36  import info.magnolia.jcr.util.NodeTypes;
37  import info.magnolia.jcr.util.NodeUtil;
38  
39  import java.io.IOException;
40  import java.util.ArrayList;
41  import java.util.Collection;
42  import java.util.Iterator;
43  import java.util.regex.Pattern;
44  
45  import javax.jcr.Node;
46  import javax.jcr.Property;
47  import javax.jcr.PropertyType;
48  import javax.servlet.jsp.JspException;
49  import javax.servlet.jsp.JspWriter;
50  import javax.servlet.jsp.tagext.TagSupport;
51  
52  import org.apache.commons.lang3.ArrayUtils;
53  import org.apache.commons.lang3.StringUtils;
54  import org.slf4j.Logger;
55  import org.slf4j.LoggerFactory;
56  import org.tldgen.annotations.BodyContent;
57  import org.tldgen.annotations.Tag;
58  
59  
60  /**
61   * Output a set of snippets taken from any paragraph in the given page mathing any of the search term.
62   *
63   * @jsp.tag name="searchResultSnippet" body-content="empty"
64   * @jsp.tag-example <pre>
65   * &lt;cmsu:simplesearch query="${param.search}" var="results" /&gt;
66   * &lt;c:forEach items="${results}" var="page"&gt;
67   *   &lt;cmsu:searchResultSnippet query="${param.search}" page="${page}" /&gt;
68   * &lt;/c:forEach&gt;
69   * </pre>
70   */
71  @Tag(name = "searchResultSnippet", bodyContent = BodyContent.EMPTY)
72  
73  public class SearchResultSnippetTag extends TagSupport {
74  
75      private static final Pattern HTML_STRIP = Pattern.compile("<.*?>", Pattern.DOTALL);
76  
77      private static final Logger log = LoggerFactory.getLogger(SearchResultSnippetTag.class);
78  
79      /**
80       * Start level.
81       */
82      private Node page;
83  
84      /**
85       * Query, natural language.
86       */
87      private String query;
88  
89      /**
90       * Number of chars to include in result.
91       */
92      private int chars = 100;
93  
94      /**
95       * Maximum number of snippets to include in result.
96       */
97      private int maxSnippets = 3;
98  
99      /**
100      * Search query.
101      *
102      * @jsp.attribute required="true" rtexprvalue="true"
103      */
104     public void setQuery(String query) {
105         this.query = query;
106     }
107 
108     /**
109      * Number of characters to include in search snippets. Default is 100.
110      *
111      * @jsp.attribute required="false" rtexprvalue="true" type="int"
112      */
113     public void setChars(int chars) {
114         this.chars = chars;
115     }
116 
117     /**
118      * Maximum number of snippets to print out.
119      *
120      * @jsp.attribute required="false" rtexprvalue="true" type="int"
121      */
122     public void setMaxSnippets(int maxSnippets) {
123         this.maxSnippets = maxSnippets;
124     }
125 
126     /**
127      * A Content node of type mgnl:content (a magnolia page), typically returned by the simpleSearch tag.
128      *
129      * @jsp.attribute required="true" rtexprvalue="true" type="info.magnolia.cms.core.Content"
130      */
131     public void setPage(Node page) {
132         this.page = page;
133     }
134 
135     /**
136      * @see javax.servlet.jsp.tagext.TagSupport#doStartTag()
137      */
138     @Override
139     public int doStartTag() throws JspException {
140 
141         JspWriter out = this.pageContext.getOut();
142         try {
143             Iterator iterator = getSnippets().iterator();
144             while (iterator.hasNext()) {
145                 out.println(iterator.next());
146             }
147         } catch (IOException e) {
148             // should never happen
149             throw new JspException(e);
150         }
151         return EVAL_PAGE;
152     }
153 
154     /**
155      * Extract a collection of snippets from any paragraph in the given page.
156      *
157      * @return a collection of Strings.
158      * @todo avoid overlapping snippets (use regexp insted of simple indexOfs)
159      * @todo only extract snippets from user-configured properties
160      * @todo abbreviate on whitespace and puntuation, detect start of sentences
161      * @todo replace ampersand in regexp
162      * @todo break methods and write junits
163      */
164     public Collection getSnippets() {
165 
166         log.debug("collecting snippets");
167 
168         Collection snippets = new ArrayList();
169         String[] searchTerms = StringUtils.split(this.query);
170 
171         try {
172             Iterator<Node> iterator = NodeUtil.getNodes(this.page, NodeTypes.ContentNode.NAME).iterator();
173 
174             outer:
175             while (iterator.hasNext()) {
176                 Node paragraphCollection = iterator.next();
177 
178                 Iterator<Node> parIterator = NodeUtil.getNodes(paragraphCollection, NodeTypes.ContentNode.NAME).iterator();
179                 while (parIterator.hasNext()) {
180                     Node paragraph = parIterator.next();
181 
182                     log.debug("Iterating on paragraph {}", paragraph);
183 
184                     Iterator dataIterator = paragraph.getProperties();
185 
186                     while (dataIterator.hasNext()) {
187                         Property property = (Property) dataIterator.next();
188                         if (property.getType() != PropertyType.BINARY) {
189 
190                             String resultString = property.getString();
191 
192                             log.debug("Iterating on property {}", property.getName());
193                             log.debug("Property value is {}", resultString);
194 
195                             // a quick and buggy way to avoid configuration properties, we should allow the user to
196                             // configure a list of nodeData to search for...
197                             if (resultString.length() < 20) {
198                                 continue;
199                             }
200 
201                             for (int j = 0; j < searchTerms.length; j++) {
202                                 String searchTerm = StringUtils.lowerCase(searchTerms[j]);
203 
204                                 // exclude keywords and words with less than 2 chars
205                                 if (!ArrayUtils.contains(new String[]{"and", "or"}, searchTerm) && searchTerm.length() > 2) {
206 
207                                     log.debug("Looking for search term [{}] in [{}]", searchTerm, resultString);
208 
209                                     // first check, avoid using heavy string replaceAll operations if the search term is not
210                                     // there
211                                     if (!StringUtils.contains(resultString.toLowerCase(), searchTerm)) {
212                                         continue;
213                                     }
214 
215                                     // strips out html tags using a regexp
216                                     resultString = stripHtmlTags(resultString);
217 
218                                     // only get first matching keyword
219                                     int pos = resultString.toLowerCase().indexOf(searchTerm);
220                                     if (pos > -1) {
221 
222                                         int posEnd = pos + searchTerm.length();
223                                         int from = (pos - chars / 2);
224                                         if (from < 0) {
225                                             from = 0;
226                                         }
227 
228                                         int to = from + chars;
229                                         if (to > resultString.length()) {
230                                             to = resultString.length();
231                                         }
232 
233                                         StringBuffer snippet = new StringBuffer();
234 
235                                         snippet.append(StringUtils.substring(resultString, from, pos));
236                                         snippet.append("<strong>");
237                                         snippet.append(StringUtils.substring(resultString, pos, posEnd));
238                                         snippet.append("</strong>");
239                                         snippet.append(StringUtils.substring(resultString, posEnd, to));
240 
241                                         if (from > 0) {
242                                             snippet.insert(0, "... ");
243                                         }
244                                         if (to < resultString.length()) {
245                                             snippet.append("... ");
246                                         }
247 
248                                         log.debug("Search term found, adding snippet {}", snippet);
249 
250                                         snippets.add(snippet);
251                                         if (snippets.size() >= this.maxSnippets) {
252 
253                                             log.debug("Maximum number of snippets ({}) reached, exiting",
254                                                     Integer.toString(this.maxSnippets));
255 
256                                             break outer;
257                                         }
258                                     }
259                                 }
260                             }
261                         }
262                     }
263                 }
264 
265             }
266             return snippets;
267         } catch (Exception e) {
268             log.error(e.getMessage(), e);
269             return null;
270         }
271     }
272 
273     protected String stripHtmlTags(String input) {
274         return HTML_STRIP.matcher(input).replaceAll("");
275     }
276 
277     /**
278      * @see javax.servlet.jsp.tagext.TagSupport#release()
279      */
280     @Override
281     public void release() {
282         this.query = null;
283         this.page = null;
284         this.chars = 100;
285         this.maxSnippets = 3;
286         super.release();
287     }
288 
289 }