View Javadoc

1   /**
2    * This file Copyright (c) 2012 Magnolia International
3    * Ltd.  (http://www.magnolia-cms.com). All rights reserved.
4    *
5    *
6    * This file is dual-licensed under both the Magnolia
7    * Network Agreement and the GNU General Public License.
8    * You may elect to use one or the other of these licenses.
9    *
10   * This file is distributed in the hope that it will be
11   * useful, but AS-IS and WITHOUT ANY WARRANTY; without even the
12   * implied warranty of MERCHANTABILITY or FITNESS FOR A
13   * PARTICULAR PURPOSE, TITLE, or NONINFRINGEMENT.
14   * Redistribution, except as permitted by whichever of the GPL
15   * or MNA you select, is prohibited.
16   *
17   * 1. For the GPL license (GPL), you can redistribute and/or
18   * modify this file under the terms of the GNU General
19   * Public License, Version 3, as published by the Free Software
20   * Foundation.  You should have received a copy of the GNU
21   * General Public License, Version 3 along with this program;
22   * if not, write to the Free Software Foundation, Inc., 51
23   * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
24   *
25   * 2. For the Magnolia Network Agreement (MNA), this file
26   * and the accompanying materials are made available under the
27   * terms of the MNA which accompanies this distribution, and
28   * is available at http://www.magnolia-cms.com/mna.html
29   *
30   * Any modifications to this file must keep this entire header
31   * intact.
32   *
33   */
34  package info.magnolia.templating.jsp.taglib;
35  
36  import info.magnolia.cms.core.MgnlNodeType;
37  import info.magnolia.jcr.util.NodeUtil;
38  
39  import java.io.IOException;
40  import java.util.ArrayList;
41  import java.util.Collection;
42  import java.util.Iterator;
43  import java.util.regex.Pattern;
44  
45  import javax.jcr.Node;
46  import javax.jcr.Property;
47  import javax.jcr.PropertyType;
48  import javax.servlet.jsp.JspException;
49  import javax.servlet.jsp.JspWriter;
50  import javax.servlet.jsp.tagext.TagSupport;
51  
52  import org.apache.commons.lang.ArrayUtils;
53  import org.apache.commons.lang.StringUtils;
54  import org.apache.commons.lang.exception.NestableRuntimeException;
55  import org.slf4j.Logger;
56  import org.slf4j.LoggerFactory;
57  
58  import org.tldgen.annotations.BodyContent;
59  import org.tldgen.annotations.Tag;
60  
61  
62  /**
63   * Output a set of snippets taken from any paragraph in the given page mathing any of the search term.
64   * @jsp.tag name="searchResultSnippet" body-content="empty"
65   * @jsp.tag-example
66   * <pre>
67   * &lt;cmsu:simplesearch query="${param.search}" var="results" /&gt;
68   * &lt;c:forEach items="${results}" var="page"&gt;
69   *   &lt;cmsu:searchResultSnippet query="${param.search}" page="${page}" /&gt;
70   * &lt;/c:forEach&gt;
71   *</pre>
72   * @author Fabrizio Giustina
73   * @version $Revision$ ($Author$)
74   */
75  @Tag(name="searchResultSnippet", bodyContent=BodyContent.EMPTY)
76  
77  public class SearchResultSnippetTag extends TagSupport {
78  
79      private static final long serialVersionUID = -2788581584059109175L;
80  
81      private static final Pattern HTML_STRIP = Pattern.compile("<.*?>", Pattern.DOTALL);
82  
83      private static final Logger log = LoggerFactory.getLogger(SearchResultSnippetTag.class);
84  
85      /**
86       * Start level.
87       */
88      private Node page;
89  
90      /**
91       * Query, natural language.
92       */
93      private String query;
94  
95      /**
96       * Number of chars to include in result.
97       */
98      private int chars = 100;
99  
100     /**
101      * Maximum number of snippets to include in result.
102      */
103     private int maxSnippets = 3;
104 
105     /**
106      * Search query.
107      * @jsp.attribute required="true" rtexprvalue="true"
108      */
109     public void setQuery(String query) {
110         this.query = query;
111     }
112 
113     /**
114      * Number of characters to include in search snippets. Default is 100.
115      * @jsp.attribute required="false" rtexprvalue="true" type="int"
116      */
117     public void setChars(int chars) {
118         this.chars = chars;
119     }
120 
121     /**
122      * Maximum number of snippets to print out.
123      * @jsp.attribute required="false" rtexprvalue="true" type="int"
124      */
125     public void setMaxSnippets(int maxSnippets) {
126         this.maxSnippets = maxSnippets;
127     }
128 
129     /**
130      * A Content node of type mgnl:content (a magnolia page), typically returned by the simpleSearch tag.
131      * @jsp.attribute required="true" rtexprvalue="true" type="info.magnolia.cms.core.Content"
132      */
133     public void setPage(Node page) {
134         this.page = page;
135     }
136 
137     /**
138      * @see javax.servlet.jsp.tagext.TagSupport#doStartTag()
139      */
140     @Override
141     public int doStartTag() throws JspException {
142 
143         JspWriter out = this.pageContext.getOut();
144         try {
145             Iterator iterator = getSnippets().iterator();
146             while (iterator.hasNext()) {
147                 out.println(iterator.next());
148             }
149         }
150         catch (IOException e) {
151             // should never happen
152             throw new NestableRuntimeException(e);
153         }
154         return EVAL_PAGE;
155     }
156 
157     /**
158      * Extract a collection of snippets from any paragraph in the given page.
159      * @return a collection of Strings.
160      * @todo avoid overlapping snippets (use regexp insted of simple indexOfs)
161      * @todo only extract snippets from user-configured properties
162      * @todo abbreviate on whitespace and puntuation, detect start of sentences
163      * @todo replace ampersand in regexp
164      * @todo break methods and write junits
165      */
166     public Collection getSnippets() {
167 
168         log.debug("collecting snippets");
169 
170         Collection snippets = new ArrayList();
171         String[] searchTerms = StringUtils.split(this.query);
172 
173         try{
174         Iterator<Node> iterator = NodeUtil.getNodes(this.page, MgnlNodeType.NT_CONTENTNODE).iterator();
175 
176         outer : while (iterator.hasNext()) {
177             Node paragraphCollection = iterator.next();
178 
179             Iterator<Node> parIterator = NodeUtil.getNodes(paragraphCollection, MgnlNodeType.NT_CONTENTNODE).iterator();
180             while (parIterator.hasNext()) {
181                 Node paragraph = parIterator.next();
182 
183                 log.debug("Iterating on paragraph {}", paragraph);
184 
185                 Iterator dataIterator = paragraph.getProperties();
186 
187                 while (dataIterator.hasNext()) {
188                     Property property = (Property) dataIterator.next();
189                     if (property.getType() != PropertyType.BINARY) {
190 
191                         String resultString = property.getString();
192 
193                         log.debug("Iterating on property {}", property.getName());
194                         log.debug("Property value is {}", resultString);
195 
196                         // a quick and buggy way to avoid configuration properties, we should allow the user to
197                         // configure a list of nodeData to search for...
198                         if (resultString.length() < 20) {
199                             continue;
200                         }
201 
202                         for (int j = 0; j < searchTerms.length; j++) {
203                             String searchTerm = StringUtils.lowerCase(searchTerms[j]);
204 
205                             // exclude keywords and words with less than 2 chars
206                             if (!ArrayUtils.contains(new String[]{"and", "or"}, searchTerm) && searchTerm.length() > 2) {
207 
208                                 log.debug("Looking for search term [{}] in [{}]", searchTerm, resultString);
209 
210                                 // first check, avoid using heavy string replaceAll operations if the search term is not
211                                 // there
212                                 if (!StringUtils.contains(resultString.toLowerCase(), searchTerm)) {
213                                     continue;
214                                 }
215 
216                                 // strips out html tags using a regexp
217                                 resultString = stripHtmlTags(resultString);
218 
219                                 // only get first matching keyword
220                                 int pos = resultString.toLowerCase().indexOf(searchTerm);
221                                 if (pos > -1) {
222 
223                                     int posEnd = pos + searchTerm.length();
224                                     int from = (pos - chars / 2);
225                                     if (from < 0) {
226                                         from = 0;
227                                     }
228 
229                                     int to = from + chars;
230                                     if (to > resultString.length()) {
231                                         to = resultString.length();
232                                     }
233 
234                                     StringBuffer snippet = new StringBuffer();
235 
236                                     snippet.append(StringUtils.substring(resultString, from, pos));
237                                     snippet.append("<strong>");
238                                     snippet.append(StringUtils.substring(resultString, pos, posEnd));
239                                     snippet.append("</strong>");
240                                     snippet.append(StringUtils.substring(resultString, posEnd, to));
241 
242                                     if (from > 0) {
243                                         snippet.insert(0, "... ");
244                                     }
245                                     if (to < resultString.length()) {
246                                         snippet.append("... ");
247                                     }
248 
249                                     log.debug("Search term found, adding snippet {}", snippet);
250 
251                                     snippets.add(snippet);
252                                     if (snippets.size() >= this.maxSnippets) {
253 
254                                         log.debug("Maximum number of snippets ({}) reached, exiting",
255                                             Integer.toString(this.maxSnippets));
256 
257                                         break outer;
258                                     }
259                                 }
260                             }
261                         }
262                     }
263                 }
264             }
265             
266         }
267         return snippets;
268         }catch(Exception e){
269             log.error(e.getMessage(), e);
270             return null;
271         }
272     }
273 
274     /**
275      * @param resultString
276      * @return
277      */
278     protected String stripHtmlTags(String input) {
279         return HTML_STRIP.matcher(input).replaceAll("");
280     }
281 
282     /**
283      * @see javax.servlet.jsp.tagext.TagSupport#release()
284      */
285     @Override
286     public void release() {
287         this.query = null;
288         this.page = null;
289         this.chars = 100;
290         this.maxSnippets = 3;
291         super.release();
292     }
293 
294 }