
/**
 * This file Copyright (c) 2003-2013 Magnolia International
 * Ltd.  (http://www.magnolia-cms.com). All rights reserved.
 *
 *
 * This file is dual-licensed under both the Magnolia
 * Network Agreement and the GNU General Public License.
 * You may elect to use one or the other of these licenses.
 *
 * This file is distributed in the hope that it will be
 * useful, but AS-IS and WITHOUT ANY WARRANTY; without even the
 * implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE, TITLE, or NONINFRINGEMENT.
 * Redistribution, except as permitted by whichever of the GPL
 * or MNA you select, is prohibited.
 *
 * 1. For the GPL license (GPL), you can redistribute and/or
 * modify this file under the terms of the GNU General
 * Public License, Version 3, as published by the Free Software
 * Foundation.  You should have received a copy of the GNU
 * General Public License, Version 3 along with this program;
 * if not, write to the Free Software Foundation, Inc., 51
 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * 2. For the Magnolia Network Agreement (MNA), this file
 * and the accompanying materials are made available under the
 * terms of the MNA which accompanies this distribution, and
 * is available at http://www.magnolia-cms.com/mna.html
 *
 * Any modifications to this file must keep this entire header
 * intact.
 *
 */
package info.magnolia.module.rssaggregator.generator;

import info.magnolia.commands.MgnlCommand;
import info.magnolia.context.Context;
import info.magnolia.context.MgnlContext;
import info.magnolia.jcr.util.NodeTypes;
import info.magnolia.jcr.util.NodeUtil;
import info.magnolia.jcr.util.PropertyUtil;
import info.magnolia.jcr.util.VersionUtil;
import info.magnolia.module.rssaggregator.RSSAggregatorConstants;
import info.magnolia.module.rssaggregator.RSSAggregatorNodeTypes;
import info.magnolia.module.rssaggregator.importhandler.FilterPredicate;
import info.magnolia.module.rssaggregator.importhandler.FilterPredicateContentMapper;
import info.magnolia.module.rssaggregator.importhandler.PlanetFilter;
import info.magnolia.module.rssaggregator.util.PlanetUtil;

import java.math.BigInteger;
import java.security.MessageDigest;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.jcr.Node;
import javax.jcr.NodeIterator;
import javax.jcr.RepositoryException;
import javax.jcr.Session;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.sun.syndication.feed.synd.SyndContentImpl;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndEntryImpl;
/**
 * Command that generates the planet post archive from the feed data of all RSS aggregator feeds marked as planet feeds.
 *
 * @author lfischer
 */
public class PlanetDataGenerator extends MgnlCommand {

    private static final Logger log = LoggerFactory.getLogger(PlanetDataGenerator.class);

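    // Archive layout: posts are grouped under numbered "posts-NNNNN" folders (starting with
    // "posts-00000"), each holding at most MAX_NODE_ENTRIES "entry-<n>" nodes.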
78      private static final String PLANET_DATANODE_NAME = "planetData";
79      private static final String POSTS_PREFIX = "posts-";
80      private static final String POSTS_FIRST = POSTS_PREFIX + "00000";
81      private static final String POST_ENTRY_PREFIX = "entry-";
82      private static final int MAX_NODE_ENTRIES = 999;
83  
84      private Session session;
85  
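    /**
     * Opens a JCR session on the RSS aggregator workspace and generates the planet archive
     * for all feeds below its root node.
     */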
    @Override
    public boolean execute(Context context) throws Exception {
        log.info("Starting command for Planet post archive.");
        session = MgnlContext.getJCRSession(RSSAggregatorConstants.WORKSPACE);
        generatePlanetFeedData(session.getRootNode());
        log.info("Finished generating Planet archive.");
        return true;
    }

    /**
     * Process all RSS Aggregator feeds and create an archive of feed items if the feed is marked as a planet feed.
     * Folders are traversed recursively.
     */
    void generatePlanetFeedData(Node root) {
        try {
            NodeIterator feeds = root.getNodes();
            while (feeds.hasNext()) {
                Node feedOrFolderNode = feeds.nextNode();
                if (NodeUtil.isNodeType(feedOrFolderNode, RSSAggregatorNodeTypes.RSSAggregator.NAME)) {
                    doGeneratePlanetDataForRSSAggregator(feedOrFolderNode);
                } else if (NodeUtil.isNodeType(feedOrFolderNode, NodeTypes.Folder.NAME)) {
                    generatePlanetFeedData(feedOrFolderNode);
                }
            }
        } catch (RepositoryException e) {
            log.error("Problem while copying feed data for planet: " + e.getMessage());
        }
    }

    private void doGeneratePlanetDataForRSSAggregator(Node feedNode) throws RepositoryException {
        // only generate the feed archive if the feed is marked as a planet feed in the data module dialog
        if (PlanetUtil.isPlanetNode(feedNode)) {
            log.info("Storing data for planet feed " + feedNode.getName());
            // create the planetData node if it does not exist yet
            Node planetData = NodeUtil.createPath(feedNode, PLANET_DATANODE_NAME, NodeTypes.Content.NAME, true);

            if (feedNode.hasNode("data")) {
                // Get the node with the feeds for Hackergotchi lookup
                NodeIterator channels = feedNode.getNode("data").getNodes();
                // load filters
                Set<FilterPredicate> planetFilters = loadPlanetFilters(feedNode);

                while (channels.hasNext()) {
                    Node channel = channels.nextNode();
                    processChannelEntries(planetData, channel, planetFilters);
                }
            }
        } else {
            log.info("Items of feed " + feedNode.getName() + " will not be archived because the feed is not marked as Planet feed.");
        }
    }

    /**
     * Iterate through the feed data and create items in the planet archive.
     *
     * @param planetData Parent node where feed items are stored.
     * @param channel The feed channel whose imported feed items should be archived.
     * @param planetFilters Filter list from data dialog.
     */
    void processChannelEntries(Node planetData, Node channel, Set<FilterPredicate> planetFilters) {
        try {
            String postsParentNode = getPostsParent(planetData, channel.getNodes().getSize());

            Node target = NodeUtil.createPath(planetData, postsParentNode, NodeTypes.Content.NAME, true);
            long entryCount = target.getNodes().getSize();
            NodeIterator entries = channel.getNodes();
            while (entries.hasNext()) {
                Node entry = entries.nextNode();
                entryCount += 1;
                createPlanetEntry(target, entry, entryCount, planetFilters);
            }
        } catch (RepositoryException e) {
            log.error("Problem while processing channel entries: " + e.getMessage());
        }
    }

    /**
     * Store a feed item within the planet archive.
     *
     * @param targetPath Parent node of the feed item.
     * @param srcEntry Original feed item imported by the RSS feed fetcher.
     * @param currEntry Number of the current entry to be stored.
     * @param planetFilters Filter list from data dialog.
     */
    void createPlanetEntry(Node targetPath, Node srcEntry, long currEntry, Set<FilterPredicate> planetFilters) {
        try {
            String author = PlanetUtil.formatName(srcEntry, "author");
            String channelTitle = PlanetUtil.formatName(srcEntry, "channelTitle");
            String title = PropertyUtil.getString(srcEntry, "title", "");
            String description;
            if (StringUtils.isNotBlank(PropertyUtil.getString(srcEntry, "content", ""))) {
                description = PropertyUtil.getString(srcEntry, "content", "");
            } else {
                description = PropertyUtil.getString(srcEntry, "description", "");
            }

            String link = PropertyUtil.getString(srcEntry, "link", "");
            Long pubDate = srcEntry.hasProperty("pubDate") ? srcEntry.getProperty("pubDate").getLong() : null;

            // only store the post entry if all necessary attributes are there
            if (StringUtils.isBlank(author) || StringUtils.isBlank(channelTitle) || StringUtils.isBlank(title)
                    || StringUtils.isBlank(description) || StringUtils.isBlank(link) || pubDate == null) {
                log.error("Found entry with missing mandatory attributes. The post will not be included in the planet archive.");
            } else {
                // build two checksums for duplicate detection: a strict one over all attributes and a looser one over author and description
                String checksum1 = getPostChecksum(author + channelTitle + title + description + link + String.valueOf(pubDate));
                String checksum2 = getPostChecksum(author + description);

                // sibling to the target path, will also be checked for duplicates
                Node targetSibling = NodeUtil.getSiblingBefore(targetPath);

                // only add the post if it doesn't exist yet
                if (!postExists(targetPath, checksum1, checksum2) && !postExists(targetSibling, checksum1, checksum2)) {
                    if (includePost(planetFilters, srcEntry)) {
                        // get the parent channel node for additional information (author and RSS links)
                        Node channelNode = srcEntry.getParent();
                        while (targetPath.hasNode(POST_ENTRY_PREFIX + currEntry)) {
                            currEntry++;
                        }
                        Node trgEntry = NodeUtil.createPath(targetPath, POST_ENTRY_PREFIX + currEntry, NodeTypes.Content.NAME, true);
                        PropertyUtil.setProperty(trgEntry, "checksum1", checksum1);
                        PropertyUtil.setProperty(trgEntry, "checksum2", checksum2);
                        PropertyUtil.setProperty(trgEntry, "author", author);
                        PropertyUtil.setProperty(trgEntry, "channelTitle", channelTitle);
                        PropertyUtil.setProperty(trgEntry, "title", title);
                        PropertyUtil.setProperty(trgEntry, "description", description);
                        PropertyUtil.setProperty(trgEntry, "link", link);
                        PropertyUtil.setProperty(trgEntry, "pubDate", pubDate);
                        PropertyUtil.setProperty(trgEntry, "authorLink", PropertyUtil.getString(channelNode, "link", ""));
                        PropertyUtil.setProperty(trgEntry, "rssLink", PropertyUtil.getString(channelNode, "rss", ""));
                        PropertyUtil.setProperty(trgEntry, "hidden", false);

                        session.save();
                    } else {
                        log.info("Post was not included because filter setting didn't match: " + StringUtils.abbreviate(title, 60));
                    }
                } else {
                    log.info("Found already existing post: " + StringUtils.abbreviate(title, 60));
                }
            }
        } catch (RepositoryException e) {
            log.error("Problem while creating planet entry: " + e.getMessage());
        }
    }

    /**
     * Determine if a feed item already exists based on checksums of the content.
     *
     * @param targetPath Archive node (parent) containing the entries to be checked.
     * @param check1 First checksum to compare.
     * @param check2 Second checksum to compare.
     * @return true, if the entry already exists, otherwise false
     */
    boolean postExists(Node targetPath, String check1, String check2) {
        boolean found = false;
        if (targetPath != null) {
            try {
                NodeIterator targetEntries = targetPath.getNodes();
                while (targetEntries.hasNext()) {
                    Node existing = targetEntries.nextNode();

                    String extCheck1 = "";
                    if (existing.hasProperty("checksum1")) {
                        extCheck1 = existing.getProperty("checksum1").getString();
                    }
                    String extCheck2 = "";
                    if (existing.hasProperty("checksum2")) {
                        extCheck2 = existing.getProperty("checksum2").getString();
                    }

                    if (StringUtils.equals(extCheck1, check1) || StringUtils.equals(extCheck2, check2)) {
                        found = true;
                        break;
                    }
                }
            } catch (RepositoryException e) {
                log.error("Problem while searching for post: " + e.getMessage());
            }
        }
        return found;
    }

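    /**
     * Decide whether a feed item should be included in the planet archive.
     *
     * Builds a temporary {@link SyndEntry} from the stored item and runs it through a
     * {@link PlanetFilter} created from the configured filter predicates. If no filters are
     * configured, or the filter check fails with a repository error, the post is included.
     *
     * @param planetFilters Filter list from data dialog.
     * @param srcNode Original feed item imported by the RSS feed fetcher.
     * @return true if the post should be included, otherwise false
     */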
    boolean includePost(Set<FilterPredicate> planetFilters, Node srcNode) {
        if (planetFilters != null && !planetFilters.isEmpty()) {
            try {
                PlanetFilter planetFilter = new PlanetFilter(planetFilters);

                SyndEntry entry = new SyndEntryImpl();
                String author = PlanetUtil.formatName(srcNode, "author");
                if (StringUtils.isNotBlank(author)) {
                    entry.setAuthor(author);
                } else {
                    entry.setAuthor(PlanetUtil.formatName(srcNode, "channelTitle"));
                }
                entry.setTitle(PropertyUtil.getString(srcNode, "title"));
                SyndContentImpl description = new SyndContentImpl();
                description.setValue(PropertyUtil.getString(srcNode, "description", ""));
                entry.setDescription(description);

                return planetFilter.include(entry);
            } catch (RepositoryException e) {
                log.error("Problem while filtering planet feed content: " + e.getMessage());
            }
        }

        return true;
    }

    /**
     * Create the name of the parent folder for archived post entries.
     *
     * This method is used to avoid having too many node entries under one parent node.
     * It retrieves the newest archive folder and checks whether the total number of existing entries under this node
     * plus the additional ones from the current run would exceed the maximum number of nodes allowed under one node.
     *
     * If the calculated number is below the maximum, the latest archive folder will be used. If the number reaches the
     * maximum, a new node will be created for storage.
     *
     * @param pdNode Parent node where the planet archive data are stored
     * @param entryCount number of entries that need to be added
     * @return Name of the archive storage node
     */
    String getPostsParent(Node pdNode, long entryCount) {
        String postsFolder = POSTS_FIRST;
        int archCount;
        try {
            // get the latest folder with posts
            archCount = pdNode.getNodes().getSize() > 0 ? (int) (pdNode.getNodes().getSize() - 1) : 0;
            postsFolder = POSTS_PREFIX + StringUtils.leftPad(String.valueOf(archCount), 5, "0");

            // check how many entries this node already has
            Node postsNode = NodeUtil.createPath(pdNode, postsFolder, NodeTypes.Content.NAME, true);
            long existingEntries = postsNode.getNodes().getSize();

            // check if we need a new parent node to avoid having too many entries under one parent node
            if ((existingEntries + entryCount) > MAX_NODE_ENTRIES) {
                postsFolder = POSTS_PREFIX + StringUtils.leftPad(String.valueOf(archCount + 1), 5, "0");
            }
        } catch (RepositoryException e) {
            log.error("Problem while getting number of highest posts node: " + e.getMessage());
        }
        return postsFolder;
    }

    /**
     * Create a checksum from a String input.
     *
     * @param postContent String content used as checksum input
     * @return MD5 checksum as a hex string, or null if the digest could not be created
     */
    String getPostChecksum(String postContent) {
        String checksum = null;
        MessageDigest md;

        try {
            md = MessageDigest.getInstance("MD5");
            md.reset();
            // digest the full byte representation (using the byte array length avoids truncating multi-byte characters)
            md.update(postContent.getBytes());
            checksum = new BigInteger(1, md.digest()).toString(16);
        } catch (Exception e) {
            log.error("Problem while creating checksum for post: " + e.getMessage());
        }
        return checksum;
    }

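    /**
     * Load the filter definitions stored below the feed's "filters" node and map them to
     * {@link FilterPredicate} instances; entries that cannot be mapped are skipped.
     *
     * @param feedNode RSS aggregator feed node holding the filter configuration
     * @return set of filter predicates, empty if no filters are configured
     */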
    Set<FilterPredicate> loadPlanetFilters(Node feedNode) {
        Set<FilterPredicate> planetFilters = new HashSet<FilterPredicate>();

        try {
            Node filtersNode = feedNode.hasNode("filters") ? feedNode.getNode("filters") : null;

            if (filtersNode != null) {
                FilterPredicateContentMapper filterPredicateMapper = new FilterPredicateContentMapper();
                List<Node> filterNodes = NodeUtil.asList(NodeUtil.getNodes(filtersNode, VersionUtil.getNodeTypeName(filtersNode)));

                for (Node n : filterNodes) {
                    FilterPredicate filterPredicate = filterPredicateMapper.map(n);
                    if (filterPredicate == null) {
                        continue;
                    }
                    planetFilters.add(filterPredicate);
                }
            }
        } catch (RepositoryException e) {
            log.error("Problem while retrieving planet feed node filters: " + e.getMessage());
        }

        return planetFilters;
    }

}