View Javadoc

1   /**
2    * This file Copyright (c) 2003-2013 Magnolia International
3    * Ltd.  (http://www.magnolia-cms.com). All rights reserved.
4    *
5    *
6    * This file is dual-licensed under both the Magnolia
7    * Network Agreement and the GNU General Public License.
8    * You may elect to use one or the other of these licenses.
9    *
10   * This file is distributed in the hope that it will be
11   * useful, but AS-IS and WITHOUT ANY WARRANTY; without even the
12   * implied warranty of MERCHANTABILITY or FITNESS FOR A
13   * PARTICULAR PURPOSE, TITLE, or NONINFRINGEMENT.
14   * Redistribution, except as permitted by whichever of the GPL
15   * or MNA you select, is prohibited.
16   *
17   * 1. For the GPL license (GPL), you can redistribute and/or
18   * modify this file under the terms of the GNU General
19   * Public License, Version 3, as published by the Free Software
20   * Foundation.  You should have received a copy of the GNU
21   * General Public License, Version 3 along with this program;
22   * if not, write to the Free Software Foundation, Inc., 51
23   * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
24   *
25   * 2. For the Magnolia Network Agreement (MNA), this file
26   * and the accompanying materials are made available under the
27   * terms of the MNA which accompanies this distribution, and
28   * is available at http://www.magnolia-cms.com/mna.html
29   *
30   * Any modifications to this file must keep this entire header
31   * intact.
32   *
33   */
package info.magnolia.module.rssaggregator.generator;

import info.magnolia.commands.MgnlCommand;
import info.magnolia.context.Context;
import info.magnolia.context.MgnlContext;
import info.magnolia.jcr.util.NodeUtil;
import info.magnolia.jcr.util.PropertyUtil;
import info.magnolia.jcr.util.VersionUtil;
import info.magnolia.module.data.DataConsts;
import info.magnolia.module.rssaggregator.importhandler.FilterPredicate;
import info.magnolia.module.rssaggregator.importhandler.FilterPredicateContentMapper;
import info.magnolia.module.rssaggregator.importhandler.PlanetFilter;
import info.magnolia.module.rssaggregator.util.PlanetUtil;

import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.jcr.Node;
import javax.jcr.NodeIterator;
import javax.jcr.RepositoryException;
import javax.jcr.Session;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.sun.syndication.feed.synd.SyndContentImpl;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndEntryImpl;
66  
/**
 * Command that builds the "planet" post archive for RSS Aggregator feeds: items of feeds flagged
 * as planet feeds are copied into a deduplicated, optionally filtered archive below each feed's
 * {@code planetData} node in the data workspace.
 *
 * @author lfischer
 */
72  public class PlanetDataGenerator extends MgnlCommand {
73  
74      private static final Logger log = LoggerFactory.getLogger(PlanetDataGenerator.class);
75  
76      private static final String PLANET_DATANODE_NAME = "planetData";
77      private static final String CONTENTTYPE_RSSAGGREGATOR = "RssAggregator";
78      private static final String POSTS_PREFIX = "posts-";
79      private static final String POSTS_FIRST = POSTS_PREFIX + "00000";
80      private static final String POST_ENTRY_PREFIX = "entry-";
81      private static final int MAX_NODE_ENTRIES = 999;
82  
83      private Session session;
84  
85      @Override
86      public boolean execute(Context context) throws Exception {
87          log.info("Starting command for Planet post archive.");
88  
89          // session = context.getJCRSession("data");
90          session = MgnlContext.getSystemContext().getJCRSession("data");
91  
92          generatePlanetFeedData();
93  
94          log.info("Finished generating Planet archive.");
95  
96          return true;
97      }
98  
99      /**
100      * Process all RSS Aggregator feeds and create an archive of feed items if the feed is marked as planet feed.
101      */
102     void generatePlanetFeedData() {
103         try {
104             //NodeIterator feeds = SessionUtil.getNode("data", "/rssaggregator").getNodes();
105             NodeIterator feeds = session.getNode("/rssaggregator").getNodes();
106 
107             while (feeds.hasNext()) {
108                 Node feedNode = feeds.nextNode();
109                 if (NodeUtil.isNodeType(feedNode, CONTENTTYPE_RSSAGGREGATOR)) {
110                     // only generate feed archive if the feed is marked as planet feed in the data module dialog
111                     if (PlanetUtil.isPlanetNode(feedNode)) {
112                         log.info("Storing data for planet feed " + feedNode.getName());
113                         // create planetDataMode if it not exists
114                         Node planetData = NodeUtil.createPath(feedNode, PLANET_DATANODE_NAME, DataConsts.MODULE_DATA_CONTENT_NODE_TYPE, true);
115 
116                         if (feedNode.hasNode("data")) {
117                             // Get the node with the feeds for Hackergotchi lookup
118                             NodeIterator channels = feedNode.getNode("data").getNodes();
119                             // load filters
120                             Set<FilterPredicate> planetFilters = loadPlanetFilters(feedNode);
121 
122                             while (channels.hasNext()) {
123                                 Node channel = channels.nextNode();
124                                 processChannelEntries(planetData, channel, planetFilters);
125                             }
126                         }
127                     } else {
128                         log.info("Items of feed " + feedNode.getName() + " will not be archived because the feed is not marked as Planet feed.");
129                     }
130                 }
131             }
132         } catch (RepositoryException e) {
133             log.error("Problem while copying feed data for planet: " + e.getMessage());
134         }
135     }
136 
137     /**
138      * Iterate through feed data and create items in the planet archive.
139      *
140      * @param planetData Parent node where feed items are stored.
141      * @param channel The current feed channel where current feed data are stored.
142      * @param planetFilters Filter list from data dialog.
143      */
144     void processChannelEntries(Node planetData, Node channel, Set<FilterPredicate> planetFilters) {
145         try {
146             String postsParentNode = getPostsParent(planetData, channel.getNodes().getSize());
147 
148             Node target = NodeUtil.createPath(planetData, postsParentNode, DataConsts.MODULE_DATA_CONTENT_NODE_TYPE, true);
149             long entryCount = target.getNodes().getSize();
150             NodeIterator entries = channel.getNodes();
151             while (entries.hasNext()) {
152                 Node entry = entries.nextNode();
153                 entryCount += 1;
154                 createPlanetEntry(target, entry, entryCount, planetFilters);
155             }
156         } catch (RepositoryException e) {
157             log.error("Problem while processing channel entries: " + e.getMessage());
158         }
159     }
160 
161     /**
162      * Store a feed item within the planet archive.
163      *
164      * @param targetPath Parent node of the feed item.
165      * @param srcEntry Original feed item imported by the RSS feed fetcher.
166      * @param currEntry Number of current entry to be stored.
167      * @param planetFilters Filter list from data dialog.
168      */
169     void createPlanetEntry(Node targetPath, Node srcEntry, long currEntry, Set<FilterPredicate> planetFilters) {
170         try {
171             String author = PlanetUtil.formatName(srcEntry, "author");
172             String channelTitle = PlanetUtil.formatName(srcEntry, "channelTitle");
173             String title = PropertyUtil.getString(srcEntry, "title", "");
174             String description = PropertyUtil.getString(srcEntry, "description", "");
175 
176             String link = PropertyUtil.getString(srcEntry, "link", "");
177             Long pubDate = srcEntry.hasProperty("pubDate") ? srcEntry.getProperty("pubDate").getLong() : null;
178 
179             // only store the post entry if all necessary attributes are there
180             if ((StringUtils.isBlank(author) || StringUtils.isBlank(channelTitle)) || StringUtils.isBlank(title)
181                     || StringUtils.isBlank(description) || StringUtils.isBlank(link) || pubDate == null) {
182                 log.error("Found entry with missing mandatory attributes. The post will not be included in the planet archive.");
183             } else {
184                 // build the checksums
185                 String checksum1 = getPostChecksum(author + channelTitle + title + description + link + String.valueOf(pubDate));
186                 String checksum2 = getPostChecksum(author + description);
187 
188                 // sibling to the target path, will also be checked for duplicates
189                 Node targetSibling = NodeUtil.getSiblingBefore(targetPath);
190 
191                 // only add the post if it doesn't exist
192                 if (!postExists(targetPath, checksum1, checksum2) && !postExists(targetSibling, checksum1, checksum2)) {
193                     // get the parent node for more information
194 
195                     if (includePost(planetFilters, srcEntry)) {
196                         Node channelNode = srcEntry.getParent();
197 
198                         Node trgEntry = NodeUtil.createPath(targetPath, POST_ENTRY_PREFIX + currEntry, DataConsts.MODULE_DATA_CONTENT_NODE_TYPE, true);
199                         PropertyUtil.setProperty(trgEntry, "checksum1", checksum1);
200                         PropertyUtil.setProperty(trgEntry, "checksum2", checksum2);
201                         PropertyUtil.setProperty(trgEntry, "author", author);
202                         PropertyUtil.setProperty(trgEntry, "channelTitle", channelTitle);
203                         PropertyUtil.setProperty(trgEntry, "title", title);
204                         PropertyUtil.setProperty(trgEntry, "description", description);
205                         PropertyUtil.setProperty(trgEntry, "link", link);
206                         PropertyUtil.setProperty(trgEntry, "pubDate", pubDate);
207                         PropertyUtil.setProperty(trgEntry, "authorLink", PropertyUtil.getString(channelNode, "link", ""));
208                         PropertyUtil.setProperty(trgEntry, "rssLink", PropertyUtil.getString(channelNode, "rss", ""));
209                         PropertyUtil.setProperty(trgEntry, "hidden", false);
210 
211                         session.save();
212                     } else {
213                         log.info("Post was not included because filter setting didn't match: " + StringUtils.abbreviate(title, 60));
214                     }
215                 } else {
216                     log.info("Found already existing post: " + StringUtils.abbreviate(title, 60));
217                 }
218             }
219         } catch (RepositoryException e) {
220             log.error("Problem while creating planet entry: " + e.getMessage());
221         }
222     }
223 
224     /**
225      * Determine if a feed item already exists based on checksums of the content.
226      *
227      * @param targetPath Archive node (parent) containing the entries to be checked.
228      * @param check1 First checksum to compare.
229      * @param check2 Second checksum to compare.
230      * @return true, if the entry already exists, otherwise false
231      */
232     boolean postExists(Node targetPath, String check1, String check2) {
233         boolean found = false;
234         if (targetPath != null) {
235             try {
236                 NodeIterator targetEntries = targetPath.getNodes();
237                 while (targetEntries.hasNext()) {
238                     Node existing = targetEntries.nextNode();
239 
240                     String extCheck1 = "";
241                     if (existing.hasProperty("checksum1")) {
242                         extCheck1 = existing.getProperty("checksum1").getString();
243                     }
244                     String extCheck2 = "";
245                     if (existing.hasProperty("checksum2")) {
246                         extCheck2 = existing.getProperty("checksum2").getString();
247                     }
248 
249                     if (StringUtils.equals(extCheck1, check1) || StringUtils.equals(extCheck2, check2)) {
250                         found = true;
251                         break;
252                     }
253                 }
254             } catch (RepositoryException e) {
255                 log.error("Problem while searching for post: " + e.getMessage());
256             }
257         }
258         return found;
259     }
260 
261     boolean includePost(Set<FilterPredicate> planetFilters, Node srcNode) {
262         if (planetFilters != null && planetFilters.size() > 0) {
263 
264             try {
265                 PlanetFilter planetFilter = new PlanetFilter(planetFilters);
266 
267                 SyndEntry entry = new SyndEntryImpl();
268                 String author = PlanetUtil.formatName(srcNode, "author");
269                 if (StringUtils.isNotBlank(author)) {
270                     entry.setAuthor(author);
271                 } else {
272                     entry.setAuthor(PlanetUtil.formatName(srcNode, "channelTitle"));
273                 }
274                 entry.setTitle(PropertyUtil.getString(srcNode, "title"));
275                 SyndContentImpl description = new SyndContentImpl();
276                 description.setValue(PropertyUtil.getString(srcNode, "description", ""));
277                 entry.setDescription(description);
278 
279                 return planetFilter.include(entry);
280             } catch (RepositoryException e) {
281                 log.error("Problem while filtering planet feed content: " + e.getMessage());
282             }
283         }
284 
285         return true;
286     }
287 
288     /**
289      * Create the name for the parent folder of archived post entries.
290      *
291      * This method is used to avoid having to many node entries under one parent node.
292      * It retrieves the newest archive folder and checks if the total number of existing entries under this node
293      * plus the additonal ones from the current run will exceed the number of maximum nodes allowed under one node.
294      *
295      * If the calculated number is below the maximum, the latest archive folder will be used. If the number reaches the
296      * maximum, a new node will be created for storage.
297      *
298      * @param pdNode Parent node where planet archive data are stored in
299      * @param entryCount number of entries that need to be added
300      * @return Name of the archive storage node
301      */
302     String getPostsParent(Node pdNode, long entryCount) {
303         String postsFolder = POSTS_FIRST;
304         int archCount;
305         try {
306             // get latest folder with posts
307             archCount = pdNode.getNodes().getSize() > 0 ? (int) (pdNode.getNodes().getSize() - 1) : 0;
308             postsFolder = POSTS_PREFIX + StringUtils.leftPad(String.valueOf(archCount), 5, "0");
309 
310             // check how many entries this node already has
311             Node postsNode = NodeUtil.createPath(pdNode, postsFolder, DataConsts.MODULE_DATA_CONTENT_NODE_TYPE, true);
312             long existingEntries = postsNode.getNodes().getSize();
313 
314             // check if we need a new parent node to avoid having to many netries under one parent node
315             if ((existingEntries + entryCount) > MAX_NODE_ENTRIES) {
316                 postsFolder = POSTS_PREFIX + StringUtils.leftPad(String.valueOf(archCount + 1), 5, "0");
317             }
318         } catch (RepositoryException e) {
319             log.error("Problem while getting number of highest posts node: " + e.getMessage());
320         }
321         return postsFolder;
322     }
323 
324     /**
325      * Create a checksum from a String input.
326      *
327      * @param postContent String content as checkusm input
328      * @return MD5 checksum
329      */
330     String getPostChecksum(String postContent) {
331         String checksum = null;
332         MessageDigest md;
333 
334         try {
335             md = MessageDigest.getInstance("MD5");
336             md.reset();
337             md.update(postContent.getBytes(), 0, postContent.length());
338             checksum = new BigInteger(1, md.digest()).toString(16);
339         } catch (Exception e) {
340             log.error("Problem while creating checksum for post: " + e.getMessage());
341         }
342         return checksum;
343     }
344 
345     Set<FilterPredicate> loadPlanetFilters(Node feedNode) {
346         Set<FilterPredicate> planetFilters = new HashSet<FilterPredicate>();
347 
348         try {
349             Node filtersNode = feedNode.hasNode("filters") ? feedNode.getNode("filters") : null;
350 
351             if (filtersNode != null) {
352                 FilterPredicateContentMapper filterPredicateMapper = new FilterPredicateContentMapper();
353                 List<Node> filterNodes = NodeUtil.asList(NodeUtil.getNodes(filtersNode, VersionUtil.getNodeTypeName(filtersNode)));
354 
355                 for (Node n : filterNodes) {
356                     FilterPredicate filterPredicate = filterPredicateMapper.map(n);
357                     if (filterPredicate == null) {
358                         continue;
359                     }
360                     planetFilters.add(filterPredicate);
361                 }
362             }
363         } catch (RepositoryException e) {
364             log.error("Problem while retrieving planet feed node filters: " + e.getMessage());
365         }
366 
367         return planetFilters;
368     }
369 
370 }