/**
 * This file Copyright (c) 2008-2015 Magnolia International
 * Ltd.  (http://www.magnolia-cms.com). All rights reserved.
 *
 *
 * This file is dual-licensed under both the Magnolia
 * Network Agreement and the GNU General Public License.
 * You may elect to use one or the other of these licenses.
 *
 * This file is distributed in the hope that it will be
 * useful, but AS-IS and WITHOUT ANY WARRANTY; without even the
 * implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE, TITLE, or NONINFRINGEMENT.
 * Redistribution, except as permitted by whichever of the GPL
 * or MNA you select, is prohibited.
 *
 * 1. For the GPL license (GPL), you can redistribute and/or
 * modify this file under the terms of the GNU General
 * Public License, Version 3, as published by the Free Software
 * Foundation.  You should have received a copy of the GNU
 * General Public License, Version 3 along with this program;
 * if not, write to the Free Software Foundation, Inc., 51
 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * 2. For the Magnolia Network Agreement (MNA), this file
 * and the accompanying materials are made available under the
 * terms of the MNA which accompanies this distribution, and
 * is available at http://www.magnolia-cms.com/mna.html
 *
 * Any modifications to this file must keep this entire header
 * intact.
 *
 */
34  package info.magnolia.module.rssaggregator.importhandler;
35  
36  import static java.lang.String.*;
37  import static org.apache.commons.lang.StringUtils.isEmpty;
38  
39  import info.magnolia.cms.core.Content;
40  import info.magnolia.jcr.util.NodeTypes;
41  import info.magnolia.jcr.util.NodeUtil;
42  import info.magnolia.jcr.util.NodeVisitor;
43  import info.magnolia.jcr.util.PropertyUtil;
44  import info.magnolia.jcr.util.VersionUtil;
45  import info.magnolia.module.data.importer.ImportException;
46  import info.magnolia.module.data.importer.ImportHandler;
47  import info.magnolia.module.data.importer.ImportTarget;
48  import info.magnolia.module.rssaggregator.RSSAggregatorNodeTypes;
49  import info.magnolia.module.rssaggregator.util.Assert;
50  
51  import java.util.ArrayList;
52  import java.util.Collection;
53  import java.util.Collections;
54  import java.util.Date;
55  import java.util.HashSet;
56  import java.util.List;
57  import java.util.Set;
58  
59  import javax.inject.Inject;
60  import javax.jcr.Node;
61  import javax.jcr.NodeIterator;
62  import javax.jcr.RepositoryException;
63  
64  import org.apache.commons.lang.StringUtils;
65  import org.apache.jackrabbit.commons.predicate.Predicate;
66  import org.slf4j.Logger;
67  import org.slf4j.LoggerFactory;
68  
69  import com.sun.syndication.feed.synd.SyndCategory;
70  import com.sun.syndication.feed.synd.SyndContent;
71  import com.sun.syndication.feed.synd.SyndEntry;
72  import com.sun.syndication.feed.synd.SyndFeed;
73  
74  /**
75   * ImportHandler capable of importing RSS and Atom feeds over http for aggregate feeds defined in RSS Aggregator module.
76   * <p/>
77   * Allows optional configuration with a {@link RSSFeedFetcher} implementation of choice, by means of configuring the node:
78   *
79   * <pre>
80   * /data/config/importers/rssaggregator
81   * </pre>
82   *
83   * If no custom implementation is configured, will fall back to using the default {@link SimpleRSSFeedFetcher}.
84   */
85  public class RSSFeedImportHandler extends ImportHandler {
86  
87      private static final Logger log = LoggerFactory.getLogger(RSSFeedImportHandler.class);
88  
89      private RSSFeedFetcher feedFetcher;
90      private AggregateFeedContentMapper aggregateFeedMapper;
91      private FilterPredicateContentMapper filterPredicateMapper;
92  
93      /**
94       * Creates handler with default mappers {@link AggregateFeedContentMapper} and {@link FilterPredicateContentMapper}.
95       */
96      @Inject
97      public RSSFeedImportHandler(RSSFeedFetcher feedFetcher) {
98          setAggregateFeedContentMapper(new AggregateFeedContentMapper());
99          setFilterPredicateContentMapper(new FilterPredicateContentMapper());
100         this.feedFetcher = feedFetcher;
101     }
102 
103 
104     @Override
105     protected synchronized void checkPreConditions() throws ImportException {
106         super.checkPreConditions();
107         log.debug("Using feed fetcher '{}'", feedFetcher.getClass().getName());
108     }
109 
110     @SuppressWarnings("unchecked")
111     @Override
112     public Set doImport(final ImportTarget target, Content root, final Set newContentUUIDs) throws ImportException {
113         try {
114             NodeUtil.visit(root.getJCRNode(), new NodeVisitor() {
115                 @Override
116                 public void visit(Node node) throws RepositoryException {
117                     Set<AggregateFeed> aggregateFeeds = loadAggregates(node);
118                     if (!aggregateFeeds.isEmpty()) {
119                         log.info("Fetching {} aggregate feeds ({} channels)", aggregateFeeds.size(), countChannels(aggregateFeeds));
120                         Set<AggregateFeed> fetchedAggregateFeeds = feedFetcher.fetchAggregateFeeds(aggregateFeeds);
121                         Set<String> newAggregateContentUUIDs = saveAggregates(fetchedAggregateFeeds, node);
122                         newContentUUIDs.addAll(newAggregateContentUUIDs);
123                         node.getSession().save();
124                         log.info("{} completed retrieving of RSS feeds", feedFetcher.getClass().getName());
125                     }
126                 }
127             }, new IsRootOrFolder());
128         } catch (RepositoryException e) {
129             String message = format("Failed to execute import for target '%s', parent node '%s'", target, root);
130             throw new ImportException(message, e);
131         }
132         return newContentUUIDs;
133     }
134 
135     // Helper methods
136 
137     private int countChannels(Set<AggregateFeed> aggregateFeeds) {
138         int channelCount = 0;
139         for (AggregateFeed aggregateFeed : aggregateFeeds) {
140             channelCount += aggregateFeed.getChannels().size();
141         }
142         return channelCount;
143     }
144 
145     /**
146      * Load the {@link AggregateFeed aggregate feed} definitions and their {@link FeedChannel feed channels} from the
147      * Content Repository.
148      *
149      * @param parentNode the parent content node that holds the aggregate feed nodes
150      * @return the aggregate feeds
151      * @throws RepositoryException when an exception occurs accessing the Content Repository
152      */
153     public Set<AggregateFeed> loadAggregates(Node parentNode) throws RepositoryException {
154 
155         List<Node> nodeIterator = NodeUtil.asList(NodeUtil.getNodes(parentNode, RSSAggregatorNodeTypes.RSSAggregator.NAME));
156         Set<AggregateFeed> aggregateFeeds = new HashSet<AggregateFeed>();
157         for (Node aggregateNode : nodeIterator) {
158             AggregateFeed aggregateFeed = aggregateFeedMapper.map(aggregateNode);
159             aggregateFeeds.add(aggregateFeed);
160         }
161         return aggregateFeeds;
162     }
163 
164     /**
165      * Save the {@link FeedChannel#feed feed entry} content contained in the {@link FeedChannel feed channels} of the
166      * given <code>aggregateFeeds</code> as children of the given <code>parentNode</code>. If an {@link AggregateFeed} has
167      * {@link AggregateFilter} defined, feed entries must pass the filter before they will be actually saved in the
168      * Content Repository.
169      *
170      * @param parentNode the parent content node of the aggregate feeds content to save
171      * @param aggregateFeeds the aggregate feeds to save
172      * @return a set of UUIDs of the newly created aggregate content nodes
173      * @throws RepositoryException when an exception occurs accessing the Content Repository
174      */
175     protected Set<String> saveAggregates(Set<AggregateFeed> aggregateFeeds, Node parentNode) throws RepositoryException {
176         Set<String> newAggregateContentUUIDs = new HashSet<String>();
177         for (AggregateFeed aggregateFeed : aggregateFeeds) {
178             Node aggregateNode = loadSingleAggregateNode(parentNode, aggregateFeed.getName());
179             Node dataNode = getOrCreateNode(aggregateNode, "data", NodeTypes.Content.NAME);
180             newAggregateContentUUIDs.add(aggregateNode.getUUID());
181             AggregateFilter aggregateFilter = loadAggregateFilter(aggregateNode);
182             for (FeedChannel channel : aggregateFeed.getChannels()) {
183                 if (channel.hasFeed()) {
184                     saveFeedChannel(channel, aggregateFilter, dataNode);
185                 }
186             }
187         }
188         return newAggregateContentUUIDs;
189     }
190 
191     /**
192      * Load a single aggregate content node from the given <code>parentNode</code> with the given
193      * <code>aggregateName</code>. If no such aggregate could be found, <code>null</code> is returned.
194      *
195      * @param parentNode the parentNode to load the node from
196      * @param aggregateNodeName the name of the aggregate content node to load
197      * @return the aggregate content node, or <code>null</code> if no such node was found
198      * @throws IllegalStateException when multiple aggregate content nodes with the same name are found
199      */
200 
201     protected Node loadSingleAggregateNode(Node parentNode, String aggregateNodeName) throws RepositoryException {
202         // ////////////////
203         // SHOULD BE MOVE IN NodeUtil
204         // ///////////////
205         NodeIterator nodeIterator = parentNode.getNodes(aggregateNodeName);
206         Collection<Node> aggregateNodes = new ArrayList<Node>();
207         while (nodeIterator.hasNext()) {
208             Node currentNode = nodeIterator.nextNode();
209             if (NodeUtil.isNodeType(currentNode, RSSAggregatorNodeTypes.RSSAggregator.NAME)) {
210                 aggregateNodes.add(currentNode);
211             }
212         }
213         // ////////////////
214         // END
215         // ///////////////
216         int size = aggregateNodes.size();
217         if (size > 1) {
218             throw new IllegalStateException(format(
219                     "Expected content node '%s' to have at most 1 child named '%s' of item type '%s', but found %s",
220                     parentNode, aggregateNodeName, RSSAggregatorNodeTypes.RSSAggregator.NAME, size));
221         }
222         if (aggregateNodes.isEmpty()) {
223             return null;
224         }
225         return aggregateNodes.iterator().next();
226     }
227 
228     /**
229      * Behaves exactly like {@link NodeUtil#createPath(Node contentNode, String name, String itemType, boolean save)}. This method exists for
230      * testability.
231      *
232      * @param contentNode the contentNode to (create if non-existent and then) get
233      * @param name the name of the node
234      * @param itemType the type of the content node
235      * @return the created content node
236      * @throws RepositoryException when an exception occurs accessing the Content Repository
237      */
238     protected Node getOrCreateNode(Node contentNode, String name, String itemType) throws RepositoryException {
239         return NodeUtil.createPath(contentNode, name, itemType, true);
240     }
241 
242     /**
243      * Load the {@link AggregateFilter} for the {@link AggregateFeed} which is represented by the given
244      * <code>aggregateNode</code>. Only
245      *
246      * @param aggregateNode the content node representing the AggregateFeed to load the AggregateFilter for
247      * @return the aggregate filter
248      * @throws RepositoryException when an exception occurs accessing the Content Repository
249      */
250     public AggregateFilter loadAggregateFilter(Node aggregateNode) throws RepositoryException {
251         Node filtersNode = aggregateNode.hasNode("filters") ? aggregateNode.getNode("filters") : null;
252         if (filtersNode == null) {
253             return new AggregateFilter(Collections.<FilterPredicate>emptySet());
254         }
255         Set<FilterPredicate> filters = new HashSet<FilterPredicate>();
256         List<Node> filterNodes = NodeUtil.asList(NodeUtil.getNodes(filtersNode, VersionUtil.getNodeTypeName(filtersNode)));
257 
258         for (Node n : filterNodes) {
259             FilterPredicate filterPredicate = filterPredicateMapper.map(n);
260             if (filterPredicate == null) {
261                 continue;
262             }
263             filters.add(filterPredicate);
264         }
265         return new AggregateFilter(filters);
266     }
267 
268     /**
269      * Save the {@link SyndFeed#getEntries() entries} contained {@link FeedChannel#feed in} the given
270      * {@link FeedChannel} that pass the given {@link AggregateFilter} in the provided <code>dataNode</code>.
271      *
272      * @param dataNode the content node to store the feed content under
273      * @param feedChannel the feed channel to save
274      * @param aggregateFilter the aggregate filter to apply to entries in the feed channel
275      * @throws RepositoryException when an exception occurs accessing the Content Repository
276      */
277     @SuppressWarnings("unchecked")
278     protected Node saveFeedChannel(FeedChannel feedChannel, AggregateFilter aggregateFilter, Node dataNode) throws RepositoryException {
279         Node channelNode = recreateFeedChannelNode(feedChannel, dataNode);
280         List<SyndEntry> entries = feedChannel.getFeed().getEntries();
281         int size = entries.size();
282         for (int i = 0; i < size; i++) {
283             SyndEntry entry = entries.get(i);
284             String entryName = format("entry-%s", i);
285             if (aggregateFilter.include(entry)) {
286                 createFeedChannelEntryNode(entry, entryName, channelNode);
287             }
288         }
289         return channelNode;
290     }
291 
292     /**
293      * Recreate the feed channel content node the given feed channel in the Content Repository.
294      *
295      * @param dataNode the node to store the feed channel under
296      * @param feedChannel the feed channel to recreate
297      * @return the created feed channel content node
298      * @throws RepositoryException when an exception occurs accessing the Content Repository
299      */
300     protected Node recreateFeedChannelNode(FeedChannel feedChannel, Node dataNode) throws RepositoryException {
301         String channelName = feedChannel.getName();
302         if (dataNode.hasNode(channelName)) {
303             String absPath = dataNode.getNode(channelName).getPath();
304             dataNode.getSession().removeItem(absPath);
305         }
306         Node channelNode = NodeUtil.createPath(dataNode, channelName, NodeTypes.Content.NAME, true);
307 
308         SyndFeed feed = feedChannel.getFeed();
309         channelNode.setProperty("description", feed.getDescription()); // 'My Blog'
310         channelNode.setProperty("link", feed.getLink()); // 'http://domain.com'
311         channelNode.setProperty("rss", feedChannel.getUrl()); // 'http://domain.com/channel.rss'
312         channelNode.setProperty("title", !isEmpty(feedChannel.getTitle()) ? feedChannel.getTitle() : feed.getTitle());
313         channelNode.setProperty("type", feed.getFeedType()); // 'rss_2.0'
314         return channelNode;
315     }
316 
317     /**
318      * Create a feed channel entry node under the given <code>channelNode</code> with the given <code>nodeName</code>
319      * for the given <code>entry</code>.
320      *
321      * @param entry the feed channel entry to save
322      * @param nodeName the name of the feed channel entry node to create
323      * @param channelNode the feed channel content node to create the feed channel entry under
324      * @throws RepositoryException when an exception occurs accessing the Content Repository
325      */
326     protected Node createFeedChannelEntryNode(SyndEntry entry, String nodeName, Node channelNode) throws RepositoryException {
327         Node entryNode = NodeUtil.createPath(channelNode, nodeName, NodeTypes.Content.NAME, true);
328         entryNode.setProperty("author", entry.getAuthor() == null ? "" : entry.getAuthor());
329         entryNode.setProperty("channelTitle", PropertyUtil.getString(channelNode, "title"));
330         final SyndContent description = entry.getDescription();
331 
332         String descriptionString;
333         if (description != null && StringUtils.isNotBlank(description.getValue())) {
334             descriptionString = description.getValue();
335         } else {
336             descriptionString = getEntryContent(entry);
337         }
338 
339         entryNode.setProperty("description", descriptionString);
340         entryNode.setProperty("content", getEntryContent(entry));
341         entryNode.setProperty("link", entry.getLink());
342         Date publishedDate = entry.getPublishedDate();
343         if (publishedDate == null) {
344             publishedDate = new Date();
345         }
346         entryNode.setProperty("pubDate", publishedDate.getTime());
347         entryNode.setProperty("title", entry.getTitle());
348 
349         createCategoriesNode(entry, entryNode);
350         return entryNode;
351     }
352 
353     /**
354      * Retrieves the main content of an entry.
355      *
356      * @param entry Feed entry holding all data.
357      * @return Entry content as String or empty String if no content is available.
358      */
359     protected String getEntryContent(SyndEntry entry) {
360         String entryContent = "";
361 
362         if (entry != null && entry.getContents().size() > 0) {
363             @SuppressWarnings("unchecked")
364             final List<SyndContent> contents = entry.getContents();
365             for (SyndContent content : contents) {
366                 if (StringUtils.equalsIgnoreCase("html", content.getType()) && StringUtils.isNotBlank(content.getType())) {
367                     entryContent = content.getValue();
368                     break;
369                 }
370             }
371         }
372         return entryContent;
373     }
374 
375     @SuppressWarnings("unchecked")
376     protected Node createCategoriesNode(SyndEntry entry, Node entryNode) throws RepositoryException {
377         Node categoriesNode = NodeUtil.createPath(entryNode, "categories", NodeTypes.Content.NAME, true);
378         List<SyndCategory> categories = entry.getCategories();
379         for (int i = 0; i < categories.size(); i++) {
380             SyndCategory category = categories.get(i);
381             String categoryIndex = valueOf(i);
382             String categoryName = category.getName();
383             categoriesNode.setProperty(categoryIndex, categoryName);
384         }
385         return categoriesNode;
386     }
387 
388     // Getters & setters
389 
390     /**
391      * for testing.
392      */
393     protected AggregateFeedContentMapper setAggregateFeedContentMapper(AggregateFeedContentMapper aggregateFeedMapper) {
394         Assert.notNull(aggregateFeedMapper, "'aggregateFeedContentMapper' must not be null");
395         this.aggregateFeedMapper = aggregateFeedMapper;
396         return this.aggregateFeedMapper;
397     }
398 
399     /**
400      * for testing.
401      */
402     public RSSFeedFetcher setFeedFetcher(RSSFeedFetcher rssFeedFetcher) {
403         Assert.notNull(rssFeedFetcher, "'rssFeedFetcher' must not be null");
404         this.feedFetcher = rssFeedFetcher;
405         return this.feedFetcher;
406     }
407 
408 
409     /**
410      * for testing.
411      */
412     protected FilterPredicateContentMapper setFilterPredicateContentMapper(FilterPredicateContentMapper filterPredicateMapper) {
413         Assert.notNull(filterPredicateMapper, "'filterPredicateContentMapper' must not be null");
414         this.filterPredicateMapper = filterPredicateMapper;
415         return this.filterPredicateMapper;
416     }
417 
418     private static class IsRootOrFolder implements Predicate {
419 
420         public IsRootOrFolder() {
421         }
422 
423         @Override
424         public boolean evaluate(Object object) {
425             if (object instanceof Node) {
426                 Node node = (Node) object;
427                 try {
428                     return NodeUtil.isNodeType(node, NodeTypes.Folder.NAME) || "/".equals(node.getPath());
429                 } catch (RepositoryException e) {
430                     log.warn("Failed to check predicate on node: " + NodeUtil.getPathIfPossible(node));
431                 }
432             }
433             return false;
434         }
435     }
436 }