View Javadoc

1   /**
2    * This file Copyright (c) 2013 Magnolia International
3    * Ltd.  (http://www.magnolia-cms.com). All rights reserved.
4    *
5    *
6    * This file is dual-licensed under both the Magnolia
7    * Network Agreement and the GNU General Public License.
8    * You may elect to use one or the other of these licenses.
9    *
10   * This file is distributed in the hope that it will be
11   * useful, but AS-IS and WITHOUT ANY WARRANTY; without even the
12   * implied warranty of MERCHANTABILITY or FITNESS FOR A
13   * PARTICULAR PURPOSE, TITLE, or NONINFRINGEMENT.
14   * Redistribution, except as permitted by whichever of the GPL
15   * or MNA you select, is prohibited.
16   *
17   * 1. For the GPL license (GPL), you can redistribute and/or
18   * modify this file under the terms of the GNU General
19   * Public License, Version 3, as published by the Free Software
20   * Foundation.  You should have received a copy of the GNU
21   * General Public License, Version 3 along with this program;
22   * if not, write to the Free Software Foundation, Inc., 51
23   * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
24   *
25   * 2. For the Magnolia Network Agreement (MNA), this file
26   * and the accompanying materials are made available under the
27   * terms of the MNA which accompanies this distribution, and
28   * is available at http://www.magnolia-cms.com/mna.html
29   *
30   * Any modifications to this file must keep this entire header
31   * intact.
32   *
33   */
34  package info.magnolia.module.rssaggregator.generator;
35  
import info.magnolia.cms.util.QueryUtil;
import info.magnolia.commands.MgnlCommand;
import info.magnolia.context.Context;
import info.magnolia.context.MgnlContext;
import info.magnolia.jcr.util.NodeUtil;
import info.magnolia.jcr.util.PropertyUtil;
import info.magnolia.jcr.util.VersionUtil;
import info.magnolia.module.data.DataConsts;
import info.magnolia.module.rssaggregator.importhandler.FilterPredicate;
import info.magnolia.module.rssaggregator.importhandler.FilterPredicateContentMapper;
import info.magnolia.module.rssaggregator.importhandler.PlanetFilter;
import info.magnolia.module.rssaggregator.util.PlanetUtil;

import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import javax.jcr.Node;
import javax.jcr.NodeIterator;
import javax.jcr.RepositoryException;
import javax.jcr.Session;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.sun.syndication.feed.synd.SyndContentImpl;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndEntryImpl;
68  
69  /**
70   * Generates the data for the planet stuff.
71   * 
72   * @author lfischer
73   */
74  public class PlanetDataGenerator extends MgnlCommand {
75  
76      private static final Logger log = LoggerFactory.getLogger(PlanetDataGenerator.class);
77  
78      private static final String PLANET_DATANODE_NAME = "planetData";
79      private static final String CONTENTTYPE_RSSAGGREGATOR = "RssAggregator";
80      private static final String POSTS_PREFIX = "posts-";
81      private static final String POSTS_FIRST = POSTS_PREFIX + "00000";
82      private static final String POST_ENTRY_PREFIX = "entry-";
83      private static final int MAX_NODE_ENTRIES = 999;
84  
85      private Session session;
86  
87      @Override
88      public boolean execute(Context context) throws Exception {
89          log.info("Starting command for Planet post archive.");
90  
91          // session = context.getJCRSession("data");
92          session = MgnlContext.getSystemContext().getJCRSession("data");
93  
94          generatePlanetFeedData();
95  
96          log.info("Finished generating Planet archive.");
97  
98          return true;
99      }
100 
101     /**
102      * Process all RSS Aggregator feeds and create an archive of feed items if the feed is marked as planet feed.
103      */
104     void generatePlanetFeedData() {
105         try {
106             Iterator<Node> feeds = NodeUtil.getNodes(session.getNode("/rssaggregator")).iterator();
107 
108             while (feeds.hasNext()) {
109                 Node feedNode = feeds.next();
110                 if (NodeUtil.isNodeType(feedNode, CONTENTTYPE_RSSAGGREGATOR)) {
111                     // only generate feed archive if the feed is marked as planet feed in the data module dialog
112                     if (PlanetUtil.isPlanetNode(feedNode)) {
113                         log.info("Storing data for planet feed " + feedNode.getName());
114                         // create planetDataMode if it not exists
115                         Node planetData = NodeUtil.createPath(feedNode, PLANET_DATANODE_NAME, DataConsts.MODULE_DATA_CONTENT_NODE_TYPE, true);
116 
117                         if (feedNode.hasNode("data")) {
118                             // Get the node with the feeds for Hackergotchi lookup
119                             Iterator<Node> channels = NodeUtil.getNodes(feedNode.getNode("data")).iterator();
120                             // load filters
121                             Set<FilterPredicate> planetFilters = loadPlanetFilters(feedNode);
122 
123                             while (channels.hasNext()) {
124                                 Node channel = channels.next();
125                                 processChannelEntries(planetData, channel, planetFilters);
126                             }
127                         }
128                     }
129                 }
130             }
131         } catch (RepositoryException e) {
132             log.error("Problem while copying feed data for planet: " + e.getMessage());
133         }
134     }
135 
136     /**
137      * Iterate through feed data and create items in the planet archive.
138      * 
139      * @param planetData Parent node where feed items are stored.
140      * @param channel The current feed channel where current feed data are stored.
141      * @param planetFilters Filter list from data dialog.
142      */
143     void processChannelEntries(Node planetData, Node channel, Set<FilterPredicate> planetFilters) {
144         try {
145             String postsParentNode = getPostsParent(planetData, NodeUtil.asList(NodeUtil.getNodes(channel)).size());
146 
147             Node target = NodeUtil.createPath(planetData, postsParentNode, DataConsts.MODULE_DATA_CONTENT_NODE_TYPE, true);
148             long entryCount = NodeUtil.asList(NodeUtil.getNodes(target)).size();
149             Iterator<Node> entries = NodeUtil.getNodes(channel).iterator();
150             while (entries.hasNext()) {
151                 Node entry = entries.next();
152                 entryCount += 1;
153                 createPlanetEntry(target, entry, entryCount, planetFilters);
154             }
155         } catch (RepositoryException e) {
156             log.error("Problem while processing channel entries: " + e.getMessage());
157         }
158     }
159 
160     /**
161      * Store a feed item within the planet archive.
162      * 
163      * @param targetPath Parent node of the feed item.
164      * @param srcEntry Original feed item imported by the RSS feed fetcher.
165      * @param currEntry Number of current entry to be stored.
166      * @param planetFilters Filter list from data dialog.
167      */
168     void createPlanetEntry(Node targetPath, Node srcEntry, long currEntry, Set<FilterPredicate> planetFilters) {
169         try {
170             String author = PlanetUtil.formatName(srcEntry, "author");
171             String channelTitle = PlanetUtil.formatName(srcEntry, "channelTitle");
172             String title = PropertyUtil.getString(srcEntry, "title", "");
173             String description;
174             if (StringUtils.isNotBlank(PropertyUtil.getString(srcEntry, "content", ""))) {
175                 description = PropertyUtil.getString(srcEntry, "content", "");
176             } else {
177                 description = PropertyUtil.getString(srcEntry, "description", "");
178             }
179 
180             String link = PropertyUtil.getString(srcEntry, "link", "");
181             Long pubDate = srcEntry.hasProperty("pubDate") ? srcEntry.getProperty("pubDate").getLong() : null;
182 
183             // only store the post entry if all necessary attributes are there
184             if (StringUtils.isNotBlank(author) && StringUtils.isNotBlank(channelTitle) && StringUtils.isNotBlank(title)
185                     && StringUtils.isNotBlank(description) && StringUtils.isNotBlank(link) && pubDate != null) {
186                 // build the checksums
187                 String checksum1 = getPostChecksum(author + channelTitle + title + description + link + String.valueOf(pubDate));
188                 String checksum2 = getPostChecksum(description);
189 
190                 // only add the post if it doesn't exist
191                 if (!postExists(targetPath, link, checksum1, checksum2)) {
192                     if (includePost(planetFilters, srcEntry)) {
193                         Node channelNode = srcEntry.getParent();
194                         while (targetPath.hasNode(POST_ENTRY_PREFIX + currEntry)) {
195                             currEntry++;
196                         }
197                         Node trgEntry = NodeUtil.createPath(targetPath, POST_ENTRY_PREFIX + currEntry, DataConsts.MODULE_DATA_CONTENT_NODE_TYPE, true);
198                         PropertyUtil.setProperty(trgEntry, "checksum1", checksum1);
199                         PropertyUtil.setProperty(trgEntry, "checksum2", checksum2);
200                         PropertyUtil.setProperty(trgEntry, "author", author);
201                         PropertyUtil.setProperty(trgEntry, "channelTitle", channelTitle);
202                         PropertyUtil.setProperty(trgEntry, "title", title);
203                         PropertyUtil.setProperty(trgEntry, "description", description);
204                         PropertyUtil.setProperty(trgEntry, "link", link);
205                         PropertyUtil.setProperty(trgEntry, "pubDate", pubDate);
206                         PropertyUtil.setProperty(trgEntry, "authorLink", PropertyUtil.getString(channelNode, "link", ""));
207                         PropertyUtil.setProperty(trgEntry, "rssLink", PropertyUtil.getString(channelNode, "rss", ""));
208                         PropertyUtil.setProperty(trgEntry, "hidden", false);
209                         log.info("Added new blog post: " + StringUtils.abbreviate(title, 60));
210 
211                         session.save();
212                     }
213                 }
214             }
215         } catch (RepositoryException e) {
216             log.error("Problem while creating planet entry: " + e.getMessage());
217         }
218     }
219 
220     /**
221      * Determine if a feed item already exists based on checksums of the content.
222      * 
223      * @param targetPath Archive node (parent) containing the entries to be checked.
224      * @param link Link to the blog post for comparison.
225      * @param check1 First checksum to compare (all post attributes).
226      * @param check2 Second checksum to compare (post description only).
227      * @return true, if the entry already exists, otherwise false
228      */
229     boolean postExists(Node targetPath, String link, String check1, String check2) {
230         boolean found = false;
231         if (targetPath != null) {
232             try {
233                 Node planetDataNode = targetPath.getParent();
234                 if (planetDataNode != null) {
235                     String sql = "select * from [dataItemNode] as t where ISDESCENDANTNODE([" + planetDataNode.getPath() + "]) and (t.link='" + link + "'" +
236                             " or t.checksum1='" + check1 + "' or t.checksum2='" + check2 + "')";
237 
238                     NodeIterator posts = QueryUtil.search("data", sql);
239                     if (posts.hasNext() && posts.nextNode() != null) {
240                         found = true;
241                     }
242                 }
243             } catch (RepositoryException e) {
244                 log.error("Problem while searching for post: " + e.getMessage());
245             }
246         }
247         return found;
248     }
249 
250     boolean includePost(Set<FilterPredicate> planetFilters, Node srcNode) {
251         if (planetFilters != null && planetFilters.size() > 0) {
252 
253             try {
254                 PlanetFilter planetFilter = new PlanetFilter(planetFilters);
255 
256                 SyndEntry entry = new SyndEntryImpl();
257                 String author = PlanetUtil.formatName(srcNode, "author");
258                 if (StringUtils.isNotBlank(author)) {
259                     entry.setAuthor(author);
260                 } else {
261                     entry.setAuthor(PlanetUtil.formatName(srcNode, "channelTitle"));
262                 }
263                 entry.setTitle(PropertyUtil.getString(srcNode, "title"));
264                 SyndContentImpl description = new SyndContentImpl();
265                 description.setValue(PropertyUtil.getString(srcNode, "description", ""));
266                 entry.setDescription(description);
267 
268                 return planetFilter.include(entry);
269             } catch (RepositoryException e) {
270                 log.error("Problem while filtering planet feed content: " + e.getMessage());
271             }
272         }
273 
274         return true;
275     }
276 
277     /**
278      * Create the name for the parent folder of archived post entries.
279      * This method is used to avoid having to many node entries under one parent node.
280      * It retrieves the newest archive folder and checks if the total number of existing entries under this node
281      * plus the additonal ones from the current run will exceed the number of maximum nodes allowed under one node.
282      * If the calculated number is below the maximum, the latest archive folder will be used. If the number reaches the
283      * maximum, a new node will be created for storage.
284      * 
285      * @param pdNode Parent node where planet archive data are stored in
286      * @param entryCount number of entries that need to be added
287      * @return Name of the archive storage node
288      */
289     String getPostsParent(Node pdNode, long entryCount) {
290         String postsFolder = POSTS_FIRST;
291         int archCount;
292         try {
293             // determine existing archive nodes without the metadata node
294             archCount = NodeUtil.asList(NodeUtil.getNodes(pdNode)).size() > 0 ? (int) (NodeUtil.asList(NodeUtil.getNodes(pdNode)).size() - 1) : 0;
295             postsFolder = POSTS_PREFIX + StringUtils.leftPad(String.valueOf(archCount), 5, "0");
296 
297             // check how many entries this node already has
298             Node postsNode = NodeUtil.createPath(pdNode, postsFolder, DataConsts.MODULE_DATA_CONTENT_NODE_TYPE, true);
299             long existingEntries = NodeUtil.asList(NodeUtil.getNodes(postsNode)).size();
300 
301             // check if we need a new parent node to avoid having to many netries under one parent node
302             if (existingEntries + entryCount > MAX_NODE_ENTRIES) {
303                 postsFolder = POSTS_PREFIX + StringUtils.leftPad(String.valueOf(archCount + 1), 5, "0");
304             }
305         } catch (RepositoryException e) {
306             log.error("Problem while getting number of highest posts node: " + e.getMessage());
307         }
308         return postsFolder;
309     }
310 
311     /**
312      * Create a checksum from a String input.
313      * 
314      * @param postContent String content as checkusm input
315      * @return MD5 checksum
316      */
317     String getPostChecksum(String postContent) {
318         String checksum = null;
319         MessageDigest md;
320 
321         try {
322             md = MessageDigest.getInstance("MD5");
323             md.reset();
324             md.update(postContent.getBytes(), 0, postContent.length());
325             checksum = new BigInteger(1, md.digest()).toString(16);
326         } catch (Exception e) {
327             log.error("Problem while creating checksum for post: " + e.getMessage());
328         }
329         return checksum;
330     }
331 
332     Set<FilterPredicate> loadPlanetFilters(Node feedNode) {
333         Set<FilterPredicate> planetFilters = new HashSet<FilterPredicate>();
334 
335         try {
336             Node filtersNode = feedNode.hasNode("filters") ? feedNode.getNode("filters") : null;
337 
338             if (filtersNode != null) {
339                 FilterPredicateContentMapper filterPredicateMapper = new FilterPredicateContentMapper();
340                 List<Node> filterNodes = NodeUtil.asList(NodeUtil.getNodes(filtersNode, VersionUtil.getNodeTypeName(filtersNode)));
341 
342                 for (Node n : filterNodes) {
343                     FilterPredicate filterPredicate = filterPredicateMapper.map(n);
344                     if (filterPredicate == null) {
345                         continue;
346                     }
347                     planetFilters.add(filterPredicate);
348                 }
349             }
350         } catch (RepositoryException e) {
351             log.error("Problem while retrieving planet feed node filters: " + e.getMessage());
352         }
353 
354         return planetFilters;
355     }
356 
357 }