
/**
 * This file Copyright (c) 2003-2013 Magnolia International
 * Ltd.  (http://www.magnolia-cms.com). All rights reserved.
 *
 *
 * This file is dual-licensed under both the Magnolia
 * Network Agreement and the GNU General Public License.
 * You may elect to use one or the other of these licenses.
 *
 * This file is distributed in the hope that it will be
 * useful, but AS-IS and WITHOUT ANY WARRANTY; without even the
 * implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE, TITLE, or NONINFRINGEMENT.
 * Redistribution, except as permitted by whichever of the GPL
 * or MNA you select, is prohibited.
 *
 * 1. For the GPL license (GPL), you can redistribute and/or
 * modify this file under the terms of the GNU General
 * Public License, Version 3, as published by the Free Software
 * Foundation.  You should have received a copy of the GNU
 * General Public License, Version 3 along with this program;
 * if not, write to the Free Software Foundation, Inc., 51
 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * 2. For the Magnolia Network Agreement (MNA), this file
 * and the accompanying materials are made available under the
 * terms of the MNA which accompanies this distribution, and
 * is available at http://www.magnolia-cms.com/mna.html
 *
 * Any modifications to this file must keep this entire header
 * intact.
 *
 */
package info.magnolia.module.rssaggregator.generator;

import info.magnolia.commands.MgnlCommand;
import info.magnolia.context.Context;
import info.magnolia.context.MgnlContext;
import info.magnolia.jcr.util.NodeUtil;
import info.magnolia.jcr.util.PropertyUtil;
import info.magnolia.jcr.util.VersionUtil;
import info.magnolia.module.data.DataConsts;
import info.magnolia.module.rssaggregator.importhandler.FilterPredicate;
import info.magnolia.module.rssaggregator.importhandler.FilterPredicateContentMapper;
import info.magnolia.module.rssaggregator.importhandler.PlanetFilter;
import info.magnolia.module.rssaggregator.util.PlanetUtil;

import java.math.BigInteger;
import java.security.MessageDigest;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.jcr.Node;
import javax.jcr.NodeIterator;
import javax.jcr.RepositoryException;
import javax.jcr.Session;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.sun.syndication.feed.synd.SyndContentImpl;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndEntryImpl;
/**
 * Command that generates the archived post data for planet feeds.
 *
 * @author lfischer
 */
public class PlanetDataGenerator extends MgnlCommand {

    private static final Logger log = LoggerFactory.getLogger(PlanetDataGenerator.class);

    private static final String PLANET_DATANODE_NAME = "planetData";
    private static final String CONTENTTYPE_RSSAGGREGATOR = "RssAggregator";
    private static final String POSTS_PREFIX = "posts-";
    private static final String POSTS_FIRST = POSTS_PREFIX + "00000";
    private static final String POST_ENTRY_PREFIX = "entry-";
    private static final int MAX_NODE_ENTRIES = 999;

    private Session session;

    @Override
    public boolean execute(Context context) throws Exception {
        log.info("Starting command for Planet post archive.");

        // session = context.getJCRSession("data");
        session = MgnlContext.getSystemContext().getJCRSession("data");

        generatePlanetFeedData();

        log.info("Finished generating Planet archive.");

        return true;
    }
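
    // Illustrative usage sketch: the command obtains its own system-context session for the "data"
    // workspace, so the passed Context is not used for repository access. How the command is
    // registered in a command catalog or triggered by a scheduled job is module configuration and
    // is assumed here; only the direct call below relies on the execute(Context) signature above.
    //
    //     PlanetDataGenerator generator = new PlanetDataGenerator();
    //     boolean ok = generator.execute(MgnlContext.getInstance());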

    /**
     * Process all RSS Aggregator feeds and create an archive of feed items if the feed is marked as a planet feed.
     */
    void generatePlanetFeedData() {
        try {
            //NodeIterator feeds = SessionUtil.getNode("data", "/rssaggregator").getNodes();
            NodeIterator feeds = session.getNode("/rssaggregator").getNodes();

            while (feeds.hasNext()) {
                Node feedNode = feeds.nextNode();
                if (NodeUtil.isNodeType(feedNode, CONTENTTYPE_RSSAGGREGATOR)) {
                    // only generate the feed archive if the feed is marked as a planet feed in the data module dialog
                    if (PlanetUtil.isPlanetNode(feedNode)) {
                        log.info("Storing data for planet feed " + feedNode.getName());
                        // create the planetData node if it does not exist yet
                        Node planetData = NodeUtil.createPath(feedNode, PLANET_DATANODE_NAME, DataConsts.MODULE_DATA_CONTENT_NODE_TYPE, true);

                        if (feedNode.hasNode("data")) {
                            // get the channel nodes of the feed, also used for the Hackergotchi lookup
                            NodeIterator channels = feedNode.getNode("data").getNodes();
                            // load filters
                            Set<FilterPredicate> planetFilters = loadPlanetFilters(feedNode);

                            while (channels.hasNext()) {
                                Node channel = channels.nextNode();
                                processChannelEntries(planetData, channel, planetFilters);
                            }
                        }
                    } else {
                        log.info("Items of feed " + feedNode.getName() + " will not be archived because the feed is not marked as a planet feed.");
                    }
                }
            }
        } catch (RepositoryException e) {
            log.error("Problem while copying feed data for planet: " + e.getMessage());
        }
    }
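
    // For orientation, the workspace layout this method reads, reconstructed from the paths used
    // above; the feed, channel and entry names are illustrative:
    //
    //     /rssaggregator
    //         someFeed                [node type RssAggregator]
    //             data
    //                 someChannel
    //                     <entries imported by the RSS feed fetcher>
    //             planetData          [created above, filled by processChannelEntries()]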

    /**
     * Iterate through feed data and create items in the planet archive.
     *
     * @param planetData Parent node where feed items are stored.
     * @param channel The current feed channel containing the imported feed data.
     * @param planetFilters Filter list from the data dialog.
     */
    void processChannelEntries(Node planetData, Node channel, Set<FilterPredicate> planetFilters) {
        try {
            String postsParentNode = getPostsParent(planetData, channel.getNodes().getSize());

            Node target = NodeUtil.createPath(planetData, postsParentNode, DataConsts.MODULE_DATA_CONTENT_NODE_TYPE, true);
            long entryCount = target.getNodes().getSize();
            NodeIterator entries = channel.getNodes();
            while (entries.hasNext()) {
                Node entry = entries.nextNode();
                entryCount += 1;
                createPlanetEntry(target, entry, entryCount, planetFilters);
            }
        } catch (RepositoryException e) {
            log.error("Problem while processing channel entries: " + e.getMessage());
        }
    }

    /**
     * Store a feed item within the planet archive.
     *
     * @param targetPath Parent node of the feed item.
     * @param srcEntry Original feed item imported by the RSS feed fetcher.
     * @param currEntry Sequence number of the entry to be stored.
     * @param planetFilters Filter list from the data dialog.
     */
    void createPlanetEntry(Node targetPath, Node srcEntry, long currEntry, Set<FilterPredicate> planetFilters) {
        try {
            String author = PlanetUtil.formatName(srcEntry, "author");
            String channelTitle = PlanetUtil.formatName(srcEntry, "channelTitle");
            String title = PropertyUtil.getString(srcEntry, "title", "");
            String description;
            if (StringUtils.isNotBlank(PropertyUtil.getString(srcEntry, "content", ""))) {
                description = PropertyUtil.getString(srcEntry, "content", "");
            } else {
                description = PropertyUtil.getString(srcEntry, "description", "");
            }

            String link = PropertyUtil.getString(srcEntry, "link", "");
            Long pubDate = srcEntry.hasProperty("pubDate") ? srcEntry.getProperty("pubDate").getLong() : null;

            // only store the post entry if all necessary attributes are present
            if ((StringUtils.isBlank(author) || StringUtils.isBlank(channelTitle)) || StringUtils.isBlank(title)
                    || StringUtils.isBlank(description) || StringUtils.isBlank(link) || pubDate == null) {
                log.error("Found entry with missing mandatory attributes. The post will not be included in the planet archive.");
            } else {
                // build the checksums
                String checksum1 = getPostChecksum(author + channelTitle + title + description + link + String.valueOf(pubDate));
                String checksum2 = getPostChecksum(author + description);
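                // checksum1 covers the complete post including link and pubDate, checksum2 only
                // author + description; postExists() treats a match on either as a duplicate,
                // presumably so that reposts with a changed link or date are still detected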

                // sibling of the target path, will also be checked for duplicates
                Node targetSibling = NodeUtil.getSiblingBefore(targetPath);

                // only add the post if it doesn't exist yet
                if (!postExists(targetPath, checksum1, checksum2) && !postExists(targetSibling, checksum1, checksum2)) {

                    if (includePost(planetFilters, srcEntry)) {
                        // get the parent channel node for additional information (author link, RSS link)
                        Node channelNode = srcEntry.getParent();

                        Node trgEntry = NodeUtil.createPath(targetPath, POST_ENTRY_PREFIX + currEntry, DataConsts.MODULE_DATA_CONTENT_NODE_TYPE, true);
                        PropertyUtil.setProperty(trgEntry, "checksum1", checksum1);
                        PropertyUtil.setProperty(trgEntry, "checksum2", checksum2);
                        PropertyUtil.setProperty(trgEntry, "author", author);
                        PropertyUtil.setProperty(trgEntry, "channelTitle", channelTitle);
                        PropertyUtil.setProperty(trgEntry, "title", title);
                        PropertyUtil.setProperty(trgEntry, "description", description);
                        PropertyUtil.setProperty(trgEntry, "link", link);
                        PropertyUtil.setProperty(trgEntry, "pubDate", pubDate);
                        PropertyUtil.setProperty(trgEntry, "authorLink", PropertyUtil.getString(channelNode, "link", ""));
                        PropertyUtil.setProperty(trgEntry, "rssLink", PropertyUtil.getString(channelNode, "rss", ""));
                        PropertyUtil.setProperty(trgEntry, "hidden", false);

                        session.save();
                    } else {
                        log.info("Post was not included because the filter settings didn't match: " + StringUtils.abbreviate(title, 60));
                    }
                } else {
                    log.info("Skipping already existing post: " + StringUtils.abbreviate(title, 60));
                }
            }
        } catch (RepositoryException e) {
            log.error("Problem while creating planet entry: " + e.getMessage());
        }
    }
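
    // Resulting archive structure, derived from the properties written above; the folder and entry
    // names follow POSTS_PREFIX/POST_ENTRY_PREFIX and are examples only:
    //
    //     planetData
    //         posts-00000
    //             entry-1    {checksum1, checksum2, author, channelTitle, title, description,
    //                         link, pubDate, authorLink, rssLink, hidden}
    //             entry-2    ...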

    /**
     * Determine if a feed item already exists based on checksums of the content.
     *
     * @param targetPath Archive node (parent) containing the entries to be checked.
     * @param check1 First checksum to compare.
     * @param check2 Second checksum to compare.
     * @return true, if the entry already exists, otherwise false
     */
    boolean postExists(Node targetPath, String check1, String check2) {
        boolean found = false;
        if (targetPath != null) {
            try {
                NodeIterator targetEntries = targetPath.getNodes();
                while (targetEntries.hasNext()) {
                    Node existing = targetEntries.nextNode();

                    String extCheck1 = "";
                    if (existing.hasProperty("checksum1")) {
                        extCheck1 = existing.getProperty("checksum1").getString();
                    }
                    String extCheck2 = "";
                    if (existing.hasProperty("checksum2")) {
                        extCheck2 = existing.getProperty("checksum2").getString();
                    }

                    if (StringUtils.equals(extCheck1, check1) || StringUtils.equals(extCheck2, check2)) {
                        found = true;
                        break;
                    }
                }
            } catch (RepositoryException e) {
                log.error("Problem while searching for post: " + e.getMessage());
            }
        }
        return found;
    }

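    /**
     * Check whether a feed entry passes the configured planet filters.
     *
     * @param planetFilters Filter list from the data dialog; if null or empty, the post is always included.
     * @param srcNode Original feed item imported by the RSS feed fetcher.
     * @return true if the post should be included in the planet archive, otherwise false
     */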
    boolean includePost(Set<FilterPredicate> planetFilters, Node srcNode) {
        if (planetFilters != null && !planetFilters.isEmpty()) {

            try {
                PlanetFilter planetFilter = new PlanetFilter(planetFilters);

                SyndEntry entry = new SyndEntryImpl();
                String author = PlanetUtil.formatName(srcNode, "author");
                if (StringUtils.isNotBlank(author)) {
                    entry.setAuthor(author);
                } else {
                    entry.setAuthor(PlanetUtil.formatName(srcNode, "channelTitle"));
                }
                entry.setTitle(PropertyUtil.getString(srcNode, "title"));
                SyndContentImpl description = new SyndContentImpl();
                description.setValue(PropertyUtil.getString(srcNode, "description", ""));
                entry.setDescription(description);

                return planetFilter.include(entry);
            } catch (RepositoryException e) {
                log.error("Problem while filtering planet feed content: " + e.getMessage());
            }
        }

        return true;
    }

    /**
     * Create the name for the parent folder of archived post entries.
     *
     * This method is used to avoid having too many entries under one parent node.
     * It retrieves the newest archive folder and checks whether the total number of existing entries under this node
     * plus the additional ones from the current run would exceed the maximum number of entries allowed under one node.
     *
     * If the calculated number is below the maximum, the latest archive folder will be used. If the number reaches the
     * maximum, a new node will be created for storage.
     *
     * @param pdNode Parent node in which the planet archive data are stored
     * @param entryCount number of entries that need to be added
     * @return Name of the archive storage node
     */
    String getPostsParent(Node pdNode, long entryCount) {
        String postsFolder = POSTS_FIRST;
        int archCount;
        try {
            // get the latest folder with posts
            archCount = pdNode.getNodes().getSize() > 0 ? (int) (pdNode.getNodes().getSize() - 1) : 0;
            postsFolder = POSTS_PREFIX + StringUtils.leftPad(String.valueOf(archCount), 5, "0");

            // check how many entries this node already has
            Node postsNode = NodeUtil.createPath(pdNode, postsFolder, DataConsts.MODULE_DATA_CONTENT_NODE_TYPE, true);
            long existingEntries = postsNode.getNodes().getSize();

            // check if we need a new parent node to avoid having too many entries under one parent node
            if ((existingEntries + entryCount) > MAX_NODE_ENTRIES) {
                postsFolder = POSTS_PREFIX + StringUtils.leftPad(String.valueOf(archCount + 1), 5, "0");
            }
        } catch (RepositoryException e) {
            log.error("Problem while getting the number of the highest posts node: " + e.getMessage());
        }
        return postsFolder;
    }
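
    // Worked example for the rotation above (numbers are illustrative): with two existing archive
    // folders the newest one is posts-00001 (archCount = 1). If posts-00001 already holds 950
    // entries and the current run adds 80 more, 950 + 80 > MAX_NODE_ENTRIES (999), so "posts-00002"
    // is returned and created by the next NodeUtil.createPath() call in processChannelEntries().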

    /**
     * Create a checksum from a String input.
     *
     * @param postContent String content as checksum input
     * @return MD5 checksum as a hex string
     */
    String getPostChecksum(String postContent) {
        String checksum = null;
        MessageDigest md;

        try {
            md = MessageDigest.getInstance("MD5");
            md.reset();
            // digest the full UTF-8 byte representation of the content; using String#length() as the
            // byte count would truncate posts containing multi-byte characters
            md.update(postContent.getBytes("UTF-8"));
            checksum = new BigInteger(1, md.digest()).toString(16);
        } catch (Exception e) {
            log.error("Problem while creating checksum for post: " + e.getMessage());
        }
        return checksum;
    }
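
    // Note on the hex conversion above: BigInteger#toString(16) drops leading zeros, so digests whose
    // first bytes are below 0x10 yield strings shorter than 32 characters. This is harmless here,
    // because the values are only ever compared against checksums produced by the same method.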

    /**
     * Load the filter definitions from the "filters" subnode of a feed node.
     *
     * @param feedNode Feed node that may contain a "filters" subnode.
     * @return Set of filter predicates; empty if no filters are defined.
     */
    Set<FilterPredicate> loadPlanetFilters(Node feedNode) {
        Set<FilterPredicate> planetFilters = new HashSet<FilterPredicate>();

        try {
            Node filtersNode = feedNode.hasNode("filters") ? feedNode.getNode("filters") : null;

            if (filtersNode != null) {
                FilterPredicateContentMapper filterPredicateMapper = new FilterPredicateContentMapper();
                List<Node> filterNodes = NodeUtil.asList(NodeUtil.getNodes(filtersNode, VersionUtil.getNodeTypeName(filtersNode)));

                for (Node n : filterNodes) {
                    FilterPredicate filterPredicate = filterPredicateMapper.map(n);
                    if (filterPredicate == null) {
                        continue;
                    }
                    planetFilters.add(filterPredicate);
                }
            }
        } catch (RepositoryException e) {
            log.error("Problem while retrieving planet feed node filters: " + e.getMessage());
        }

        return planetFilters;
    }

}