View Javadoc

1   /**
2    * This file Copyright (c) 2013 Magnolia International
3    * Ltd.  (http://www.magnolia-cms.com). All rights reserved.
4    *
5    *
6    * This file is dual-licensed under both the Magnolia
7    * Network Agreement and the GNU General Public License.
8    * You may elect to use one or the other of these licenses.
9    *
10   * This file is distributed in the hope that it will be
11   * useful, but AS-IS and WITHOUT ANY WARRANTY; without even the
12   * implied warranty of MERCHANTABILITY or FITNESS FOR A
13   * PARTICULAR PURPOSE, TITLE, or NONINFRINGEMENT.
14   * Redistribution, except as permitted by whichever of the GPL
15   * or MNA you select, is prohibited.
16   *
17   * 1. For the GPL license (GPL), you can redistribute and/or
18   * modify this file under the terms of the GNU General
19   * Public License, Version 3, as published by the Free Software
20   * Foundation.  You should have received a copy of the GNU
21   * General Public License, Version 3 along with this program;
22   * if not, write to the Free Software Foundation, Inc., 51
23   * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
24   *
25   * 2. For the Magnolia Network Agreement (MNA), this file
26   * and the accompanying materials are made available under the
27   * terms of the MNA which accompanies this distribution, and
28   * is available at http://www.magnolia-cms.com/mna.html
29   *
30   * Any modifications to this file must keep this entire header
31   * intact.
32   *
33   */
34  package info.magnolia.module.rssaggregator.generator;
35  
import info.magnolia.cms.util.QueryUtil;
import info.magnolia.commands.MgnlCommand;
import info.magnolia.context.Context;
import info.magnolia.context.MgnlContext;
import info.magnolia.jcr.util.NodeTypes;
import info.magnolia.jcr.util.NodeUtil;
import info.magnolia.jcr.util.PropertyUtil;
import info.magnolia.jcr.util.VersionUtil;
import info.magnolia.module.rssaggregator.RSSAggregatorConstants;
import info.magnolia.module.rssaggregator.RSSAggregatorNodeTypes;
import info.magnolia.module.rssaggregator.importhandler.FilterPredicate;
import info.magnolia.module.rssaggregator.importhandler.FilterPredicateContentMapper;
import info.magnolia.module.rssaggregator.importhandler.PlanetFilter;
import info.magnolia.module.rssaggregator.util.PlanetUtil;

import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.jcr.Node;
import javax.jcr.NodeIterator;
import javax.jcr.RepositoryException;
import javax.jcr.Session;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.sun.syndication.feed.synd.SyndContentImpl;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndEntryImpl;
69  
70  /**
71   * Generates the data for the planet stuff.
72   *
73   * @author lfischer
74   */
75  public class PlanetDataGenerator extends MgnlCommand {
76  
77      private static final Logger log = LoggerFactory.getLogger(PlanetDataGenerator.class);
78  
79      private static final String PLANET_DATANODE_NAME = "planetData";
80      private static final String POSTS_PREFIX = "posts-";
81      private static final String POSTS_FIRST = POSTS_PREFIX + "00000";
82      private static final String POST_ENTRY_PREFIX = "entry-";
83      private static final int MAX_NODE_ENTRIES = 999;
84  
85      private Session session;
86  
87      @Override
88      public boolean execute(Context context) throws Exception {
89          log.info("Starting command for Planet post archive.");
90          session = MgnlContext.getJCRSession(RSSAggregatorConstants.WORKSPACE);
91          generatePlanetFeedData(session.getRootNode());
92          log.info("Finished generating Planet archive.");
93          return true;
94      }
95  
96      /**
97       * Process all RSS Aggregator feeds and create an archive of feed items if the feed is marked as planet feed.
98       */
99      void generatePlanetFeedData(Node root) {
100         try {
101             NodeIterator feeds = root.getNodes();
102             while (feeds.hasNext()) {
103                 Node feedOrFolderNode = feeds.nextNode();
104                 if (NodeUtil.isNodeType(feedOrFolderNode, RSSAggregatorNodeTypes.RSSAggregator.NAME)) {
105                     doGeneratePlanetDataForRSSAggregator(feedOrFolderNode);
106                 } else if (NodeUtil.isNodeType(feedOrFolderNode, NodeTypes.Folder.NAME)) {
107                     generatePlanetFeedData(feedOrFolderNode);
108                 }
109             }
110         } catch (RepositoryException e) {
111             log.error("Problem while copying feed data for planet: " + e.getMessage());
112         }
113     }
114 
115     private void doGeneratePlanetDataForRSSAggregator(Node feedNode) throws RepositoryException {
116         // only generate feed archive if the feed is marked as planet feed in the data module dialog
117         if (PlanetUtil.isPlanetNode(feedNode)) {
118             log.info("Storing data for planet feed " + feedNode.getName());
119             // create planetDataMode if it not exists
120             Node planetData = NodeUtil.createPath(feedNode, PLANET_DATANODE_NAME, NodeTypes.Content.NAME, true);
121 
122             if (feedNode.hasNode("data")) {
123                 // Get the node with the feeds for Hackergotchi lookup
124                 NodeIterator channels = feedNode.getNode("data").getNodes();
125                 // load filters
126                 Set<FilterPredicate> planetFilters = loadPlanetFilters(feedNode);
127 
128                 while (channels.hasNext()) {
129                     Node channel = channels.nextNode();
130                     processChannelEntries(planetData, channel, planetFilters);
131                 }
132             }
133         } else {
134             log.info("Items of feed " + feedNode.getName() + " will not be archived because the feed is not marked as Planet feed.");
135         }
136     }
137 
138     /**
139      * Iterate through feed data and create items in the planet archive.
140      *
141      * @param planetData Parent node where feed items are stored.
142      * @param channel The current feed channel where current feed data are stored.
143      * @param planetFilters Filter list from data dialog.
144      */
145     void processChannelEntries(Node planetData, Node channel, Set<FilterPredicate> planetFilters) {
146         try {
147             String postsParentNode = getPostsParent(planetData, NodeUtil.asList(NodeUtil.getNodes(channel)).size());
148 
149             Node target = NodeUtil.createPath(planetData, postsParentNode, NodeTypes.Content.NAME, true);
150             long entryCount = target.getNodes().getSize();
151             NodeIterator entries = channel.getNodes();
152             while (entries.hasNext()) {
153                 Node entry = entries.nextNode();
154                 entryCount += 1;
155                 createPlanetEntry(target, entry, entryCount, planetFilters);
156             }
157         } catch (RepositoryException e) {
158             log.error("Problem while processing channel entries: " + e.getMessage());
159         }
160     }
161 
162     /**
163      * Store a feed item within the planet archive.
164      *
165      * @param targetPath Parent node of the feed item.
166      * @param srcEntry Original feed item imported by the RSS feed fetcher.
167      * @param currEntry Number of current entry to be stored.
168      * @param planetFilters Filter list from data dialog.
169      */
170     void createPlanetEntry(Node targetPath, Node srcEntry, long currEntry, Set<FilterPredicate> planetFilters) {
171         try {
172             String author = PlanetUtil.formatName(srcEntry, "author");
173             String channelTitle = PlanetUtil.formatName(srcEntry, "channelTitle");
174             String title = PropertyUtil.getString(srcEntry, "title", "");
175             String description;
176             if (StringUtils.isNotBlank(PropertyUtil.getString(srcEntry, "content", ""))) {
177                 description = PropertyUtil.getString(srcEntry, "content", "");
178             } else {
179                 description = PropertyUtil.getString(srcEntry, "description", "");
180             }
181 
182             String link = PropertyUtil.getString(srcEntry, "link", "");
183             Long pubDate = srcEntry.hasProperty("pubDate") ? srcEntry.getProperty("pubDate").getLong() : null;
184 
185             // only store the post entry if all necessary attributes are there
186             if (StringUtils.isNotBlank(author) && StringUtils.isNotBlank(channelTitle) && StringUtils.isNotBlank(title)
187                     && StringUtils.isNotBlank(description) && StringUtils.isNotBlank(link) && pubDate != null) {
188                 // build the checksums
189                 String checksum1 = getPostChecksum(author + channelTitle + title + description + link + String.valueOf(pubDate));
190                 String checksum2 = getPostChecksum(description);
191 
192                 // only add the post if it doesn't exist
193                 if (!postExists(targetPath, link, checksum1, checksum2)) {
194                     if (includePost(planetFilters, srcEntry)) {
195                         Node channelNode = srcEntry.getParent();
196                         while (targetPath.hasNode(POST_ENTRY_PREFIX + currEntry)) {
197                             currEntry++;
198                         }
199                         Node trgEntry = NodeUtil.createPath(targetPath, POST_ENTRY_PREFIX + currEntry, NodeTypes.Content.NAME, true);
200                         PropertyUtil.setProperty(trgEntry, "checksum1", checksum1);
201                         PropertyUtil.setProperty(trgEntry, "checksum2", checksum2);
202                         PropertyUtil.setProperty(trgEntry, "author", author);
203                         PropertyUtil.setProperty(trgEntry, "channelTitle", channelTitle);
204                         PropertyUtil.setProperty(trgEntry, "title", title);
205                         PropertyUtil.setProperty(trgEntry, "description", description);
206                         PropertyUtil.setProperty(trgEntry, "link", link);
207                         PropertyUtil.setProperty(trgEntry, "pubDate", pubDate);
208                         PropertyUtil.setProperty(trgEntry, "authorLink", PropertyUtil.getString(channelNode, "link", ""));
209                         PropertyUtil.setProperty(trgEntry, "rssLink", PropertyUtil.getString(channelNode, "rss", ""));
210                         PropertyUtil.setProperty(trgEntry, "hidden", false);
211                         log.info("Added new blog post: " + StringUtils.abbreviate(title, 60));
212 
213                         session.save();
214                     }
215                 }
216             }
217         } catch (RepositoryException e) {
218             log.error("Problem while creating planet entry: " + e.getMessage());
219         }
220     }
221 
222     /**
223      * Determine if a feed item already exists based on checksums of the content.
224      *
225      * @param targetPath Archive node (parent) containing the entries to be checked.
226      * @param link Link to the blog post for comparison.
227      * @param check1 First checksum to compare (all post attributes).
228      * @param check2 Second checksum to compare (post description only).
229      * @return true, if the entry already exists, otherwise false
230      */
231     boolean postExists(Node targetPath, String link, String check1, String check2) {
232         boolean found = false;
233         if (targetPath != null) {
234             try {
235                 Node planetDataNode = targetPath.getParent();
236                 if (planetDataNode != null) {
237                     String sql = "select * from [mgnl:content] as t where ISDESCENDANTNODE([" + planetDataNode.getPath() + "]) and (t.link='" + link + "'" +
238                             " or t.checksum1='" + check1 + "' or t.checksum2='" + check2 + "')";
239 
240                     NodeIterator posts = QueryUtil.search(RSSAggregatorConstants.WORKSPACE, sql);
241                     if (posts.hasNext() && posts.nextNode() != null) {
242                         found = true;
243                     }
244                 }
245             } catch (RepositoryException e) {
246                 log.error("Problem while searching for post: " + e.getMessage());
247             }
248         }
249         return found;
250     }
251 
252     boolean includePost(Set<FilterPredicate> planetFilters, Node srcNode) {
253         if (planetFilters != null && planetFilters.size() > 0) {
254 
255             try {
256                 PlanetFilter planetFilter = new PlanetFilter(planetFilters);
257 
258                 SyndEntry entry = new SyndEntryImpl();
259                 String author = PlanetUtil.formatName(srcNode, "author");
260                 if (StringUtils.isNotBlank(author)) {
261                     entry.setAuthor(author);
262                 } else {
263                     entry.setAuthor(PlanetUtil.formatName(srcNode, "channelTitle"));
264                 }
265                 entry.setTitle(PropertyUtil.getString(srcNode, "title"));
266                 SyndContentImpl description = new SyndContentImpl();
267                 description.setValue(PropertyUtil.getString(srcNode, "description", ""));
268                 entry.setDescription(description);
269 
270                 return planetFilter.include(entry);
271             } catch (RepositoryException e) {
272                 log.error("Problem while filtering planet feed content: " + e.getMessage());
273             }
274         }
275 
276         return true;
277     }
278 
279     /**
280      * Create the name for the parent folder of archived post entries.
281      * This method is used to avoid having to many node entries under one parent node.
282      * It retrieves the newest archive folder and checks if the total number of existing entries under this node
283      * plus the additonal ones from the current run will exceed the number of maximum nodes allowed under one node.
284      * If the calculated number is below the maximum, the latest archive folder will be used. If the number reaches the
285      * maximum, a new node will be created for storage.
286      *
287      * @param pdNode Parent node where planet archive data are stored in
288      * @param entryCount number of entries that need to be added
289      * @return Name of the archive storage node
290      */
291     String getPostsParent(Node pdNode, long entryCount) {
292         String postsFolder = POSTS_FIRST;
293         int archCount;
294         try {
295             // determine existing archive nodes without the metadata node
296             archCount = NodeUtil.asList(NodeUtil.getNodes(pdNode)).size() > 0 ? (int) (NodeUtil.asList(NodeUtil.getNodes(pdNode)).size() - 1) : 0;
297             postsFolder = POSTS_PREFIX + StringUtils.leftPad(String.valueOf(archCount), 5, "0");
298 
299             // check how many entries this node already has
300             Node postsNode = NodeUtil.createPath(pdNode, postsFolder, NodeTypes.Content.NAME, true);
301             long existingEntries = postsNode.getNodes().getSize();
302 
303             // check if we need a new parent node to avoid having to many netries under one parent node
304             if (existingEntries + entryCount > MAX_NODE_ENTRIES) {
305                 postsFolder = POSTS_PREFIX + StringUtils.leftPad(String.valueOf(archCount + 1), 5, "0");
306             }
307         } catch (RepositoryException e) {
308             log.error("Problem while getting number of highest posts node: " + e.getMessage());
309         }
310         return postsFolder;
311     }
312 
313     /**
314      * Create a checksum from a String input.
315      *
316      * @param postContent String content as checkusm input
317      * @return MD5 checksum
318      */
319     String getPostChecksum(String postContent) {
320         String checksum = null;
321         MessageDigest md;
322 
323         try {
324             md = MessageDigest.getInstance("MD5");
325             md.reset();
326             md.update(postContent.getBytes(), 0, postContent.length());
327             checksum = new BigInteger(1, md.digest()).toString(16);
328         } catch (Exception e) {
329             log.error("Problem while creating checksum for post: " + e.getMessage());
330         }
331         return checksum;
332     }
333 
334     Set<FilterPredicate> loadPlanetFilters(Node feedNode) {
335         Set<FilterPredicate> planetFilters = new HashSet<FilterPredicate>();
336 
337         try {
338             Node filtersNode = feedNode.hasNode("filters") ? feedNode.getNode("filters") : null;
339 
340             if (filtersNode != null) {
341                 FilterPredicateContentMapper filterPredicateMapper = new FilterPredicateContentMapper();
342                 List<Node> filterNodes = NodeUtil.asList(NodeUtil.getNodes(filtersNode, VersionUtil.getNodeTypeName(filtersNode)));
343 
344                 for (Node n : filterNodes) {
345                     FilterPredicate filterPredicate = filterPredicateMapper.map(n);
346                     if (filterPredicate == null) {
347                         continue;
348                     }
349                     planetFilters.add(filterPredicate);
350                 }
351             }
352         } catch (RepositoryException e) {
353             log.error("Problem while retrieving planet feed node filters: " + e.getMessage());
354         }
355 
356         return planetFilters;
357     }
358 
359 }