View Javadoc

1   /**
2    * This file Copyright (c) 2003-2010 Magnolia International
3    * Ltd.  (http://www.magnolia-cms.com). All rights reserved.
4    *
5    *
6    * This file is dual-licensed under both the Magnolia
7    * Network Agreement and the GNU General Public License.
8    * You may elect to use one or the other of these licenses.
9    *
10   * This file is distributed in the hope that it will be
11   * useful, but AS-IS and WITHOUT ANY WARRANTY; without even the
12   * implied warranty of MERCHANTABILITY or FITNESS FOR A
13   * PARTICULAR PURPOSE, TITLE, or NONINFRINGEMENT.
14   * Redistribution, except as permitted by whichever of the GPL
15   * or MNA you select, is prohibited.
16   *
17   * 1. For the GPL license (GPL), you can redistribute and/or
18   * modify this file under the terms of the GNU General
19   * Public License, Version 3, as published by the Free Software
20   * Foundation.  You should have received a copy of the GNU
21   * General Public License, Version 3 along with this program;
22   * if not, write to the Free Software Foundation, Inc., 51
23   * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
24   *
25   * 2. For the Magnolia Network Agreement (MNA), this file
26   * and the accompanying materials are made available under the
27   * terms of the MNA which accompanies this distribution, and
28   * is available at http://www.magnolia-cms.com/mna.html
29   *
30   * Any modifications to this file must keep this entire header
31   * intact.
32   *
33   */
34  package info.magnolia.cms.taglibs.util;
35  
36  import info.magnolia.cms.core.Content;
37  import info.magnolia.cms.core.ItemType;
38  import info.magnolia.cms.core.NodeData;
39  
40  import java.io.IOException;
41  import java.util.ArrayList;
42  import java.util.Collection;
43  import java.util.Iterator;
44  import java.util.regex.Pattern;
45  
46  import javax.jcr.PropertyType;
47  import javax.servlet.jsp.JspException;
48  import javax.servlet.jsp.JspWriter;
49  import javax.servlet.jsp.tagext.TagSupport;
50  
51  import org.apache.commons.lang.ArrayUtils;
52  import org.apache.commons.lang.StringUtils;
53  import org.apache.commons.lang.exception.NestableRuntimeException;
54  import org.slf4j.Logger;
55  import org.slf4j.LoggerFactory;
56  
57  
58  /**
59   * Output a set of snippets taken from any paragraph in the given page mathing any of the search term.
60   * @jsp.tag name="searchResultSnippet" body-content="empty"
61   * @jsp.tag-example
62   * <cmsu:simplesearch query="${param.search}" var="results" />
63   * <c:forEach items="${results}" var="page">
64   *   <cmsu:searchResultSnippet query="${param.search}" page="${page}" />
65   * </c:forEach>
66   *
67   * @author Fabrizio Giustina
68   * @version $Revision: 32667 $ ($Author: gjoseph $)
69   */
70  public class SearchResultSnippetTag extends TagSupport {
71      private static final Pattern HTML_STRIP = Pattern.compile("<.*?>", Pattern.DOTALL);
72  
73      private static final Logger log = LoggerFactory.getLogger(SearchResultSnippetTag.class);
74  
75      /**
76       * Start level.
77       */
78      private Content page;
79  
80      /**
81       * Query, natural language.
82       */
83      private String query;
84  
85      /**
86       * Number of chars to include in result.
87       */
88      private int chars = 100;
89  
90      /**
91       * Maximum number of snippets to include in result.
92       */
93      private int maxSnippets = 3;
94  
95      /**
96       * Search query.
97       * @jsp.attribute required="true" rtexprvalue="true"
98       */
99      public void setQuery(String query) {
100         this.query = query;
101     }
102 
103     /**
104      * Number of characters to include in search snippets. Default is 100.
105      * @jsp.attribute required="false" rtexprvalue="true" type="int"
106      */
107     public void setChars(int chars) {
108         this.chars = chars;
109     }
110 
111     /**
112      * Maximum number of snippets to print out.
113      * @jsp.attribute required="false" rtexprvalue="true" type="int"
114      */
115     public void setMaxSnippets(int maxSnippets) {
116         this.maxSnippets = maxSnippets;
117     }
118 
119     /**
120      * A Content node of type mgnl:content (a magnolia page), typically returned by the simpleSearch tag.
121      * @jsp.attribute required="true" rtexprvalue="true" type="info.magnolia.cms.core.Content"
122      */
123     public void setPage(Content page) {
124         this.page = page;
125     }
126 
127     /**
128      * @see javax.servlet.jsp.tagext.TagSupport#doStartTag()
129      */
130     public int doStartTag() throws JspException {
131 
132         JspWriter out = this.pageContext.getOut();
133         try {
134             Iterator iterator = getSnippets().iterator();
135             while (iterator.hasNext()) {
136                 out.println(iterator.next());
137             }
138         }
139         catch (IOException e) {
140             // should never happen
141             throw new NestableRuntimeException(e);
142         }
143         return EVAL_PAGE;
144     }
145 
146     /**
147      * Extract a collection of snippets from any paragraph in the given page.
148      * @return a collection of Strings.
149      * @todo avoid overlapping snippets (use regexp insted of simple indexOfs)
150      * @todo only extract snippets from user-configured properties
151      * @todo abbreviate on whitespace and puntuation, detect start of sentences
152      * @todo replace ampersand in regexp
153      * @todo break methods and write junits
154      */
155     public Collection getSnippets() {
156 
157         log.debug("collecting snippets"); //$NON-NLS-1$
158 
159         Collection snippets = new ArrayList();
160         String[] searchTerms = StringUtils.split(this.query);
161 
162         Collection paragraphCollections = this.page.getChildren(ItemType.CONTENTNODE);
163 
164         Iterator iterator = paragraphCollections.iterator();
165         outer : while (iterator.hasNext()) {
166             Content paragraphCollection = (Content) iterator.next();
167 
168             Collection paragraphs = paragraphCollection.getChildren();
169 
170             Iterator parIterator = paragraphs.iterator();
171             while (parIterator.hasNext()) {
172                 Content paragraph = (Content) parIterator.next();
173 
174                 log.debug("Iterating on paragraph {}", paragraph); //$NON-NLS-1$
175 
176                 Collection properties = paragraph.getNodeDataCollection();
177 
178                 Iterator dataIterator = properties.iterator();
179                 while (dataIterator.hasNext()) {
180                     NodeData property = (NodeData) dataIterator.next();
181                     if (property.getType() != PropertyType.BINARY) {
182 
183                         String resultString = property.getString();
184 
185                         log.debug("Iterating on property {}", property.getName()); //$NON-NLS-1$
186                         log.debug("Property value is {}", resultString); //$NON-NLS-1$
187 
188                         // a quick and buggy way to avoid configuration properties, we should allow the user to
189                         // configure a list of nodeData to search for...
190                         if (resultString.length() < 20) {
191                             continue;
192                         }
193 
194                         for (int j = 0; j < searchTerms.length; j++) {
195                             String searchTerm = StringUtils.lowerCase(searchTerms[j]);
196 
197                             // exclude keywords and words with less than 2 chars
198                             if (!ArrayUtils.contains(SimpleSearchTag.KEYWORDS, searchTerm) && searchTerm.length() > 2) {
199 
200                                 log.debug("Looking for search term [{}] in [{}]", searchTerm, resultString); //$NON-NLS-1$
201 
202                                 // first check, avoid using heavy string replaceAll operations if the search term is not
203                                 // there
204                                 if (!StringUtils.contains(resultString.toLowerCase(), searchTerm)) {
205                                     continue;
206                                 }
207 
208                                 // strips out html tags using a regexp
209                                 resultString = stripHtmlTags(resultString);
210 
211                                 // only get first matching keyword
212                                 int pos = resultString.toLowerCase().indexOf(searchTerm);
213                                 if (pos > -1) {
214 
215                                     int posEnd = pos + searchTerm.length();
216                                     int from = (pos - chars / 2);
217                                     if (from < 0) {
218                                         from = 0;
219                                     }
220 
221                                     int to = from + chars;
222                                     if (to > resultString.length()) {
223                                         to = resultString.length();
224                                     }
225 
226                                     StringBuffer snippet = new StringBuffer();
227 
228                                     snippet.append(StringUtils.substring(resultString, from, pos));
229                                     snippet.append("<strong>"); //$NON-NLS-1$
230                                     snippet.append(StringUtils.substring(resultString, pos, posEnd));
231                                     snippet.append("</strong>"); //$NON-NLS-1$
232                                     snippet.append(StringUtils.substring(resultString, posEnd, to));
233 
234                                     if (from > 0) {
235                                         snippet.insert(0, "... "); //$NON-NLS-1$
236                                     }
237                                     if (to < resultString.length()) {
238                                         snippet.append("... "); //$NON-NLS-1$
239                                     }
240 
241                                     log.debug("Search term found, adding snippet {}", snippet); //$NON-NLS-1$
242 
243                                     snippets.add(snippet);
244                                     if (snippets.size() >= this.maxSnippets) {
245 
246                                         log.debug("Maximum number of snippets ({}) reached, exiting", //$NON-NLS-1$
247                                             Integer.toString(this.maxSnippets));
248 
249                                         break outer;
250                                     }
251                                 }
252                             }
253                         }
254                     }
255                 }
256             }
257         }
258 
259         return snippets;
260     }
261 
262     /**
263      * @param resultString
264      * @return
265      */
266     protected String stripHtmlTags(String input) {
267         return HTML_STRIP.matcher(input).replaceAll("");
268     }
269 
270     /**
271      * @see javax.servlet.jsp.tagext.TagSupport#release()
272      */
273     public void release() {
274         this.query = null;
275         this.page = null;
276         this.chars = 100;
277         this.maxSnippets = 3;
278         super.release();
279     }
280 
281 }