1 |
package skrueger; |
2 |
|
3 |
import java.io.FileWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
23 |
|
24 |
/** |
25 |
* This class provides an easy way to list all pages of your site (including |
26 |
* generates ones) ina sitemap.xml. See http://sitemaps.org/protocol.php for the |
27 |
* XML specification. |
28 |
* |
29 |
* <br/> |
30 |
* XML output is based on org.w3c.dom |
31 |
* |
32 |
* |
33 |
* @author Stefan Tzeggai |
34 |
*/ |
35 |
public class Sitemap { |
36 |
|
37 |
static String[] SEARCHENGINES = new String[] { |
38 |
"http://www.sitemapwriter.com/notify.php?crawler=all&url=", |
39 |
"http://submissions.ask.com/ping?sitemap=", |
40 |
"http://www.google.com/webmasters/tools/ping?sitemap=", |
41 |
"http://api.moreover.com/ping?u=", |
42 |
"http://www.bing.com/webmaster/ping.aspx?siteMap=", |
43 |
"http://search.yahooapis.com/SiteExplorerService/V1/ping?sitemap=" }; |
44 |
|
45 |
final static Logger log = Logger.getLogger(Sitemap.class); |
46 |
|
47 |
private Document document; |
48 |
|
49 |
final private Element urlsetElement; |
50 |
|
51 |
/** |
52 |
* Returns the number of URLs. May not be more than 50,000 by spec. |
53 |
*/ |
54 |
private int size = 0; |
55 |
|
56 |
final static String NSURL = "http://www.sitemaps.org/schemas/sitemap/0.9"; |
57 |
|
58 |
public enum CHANGEFREQ { |
59 |
monthly, daily, weekly |
60 |
} |
61 |
|
62 |
public Sitemap() { |
63 |
|
64 |
try { |
65 |
|
66 |
// Create a DOM builder and parse the fragment |
67 |
final DocumentBuilderFactory factory = DocumentBuilderFactory |
68 |
.newInstance(); |
69 |
document = factory.newDocumentBuilder().newDocument(); |
70 |
|
71 |
// XML root element |
72 |
urlsetElement = document.createElementNS(NSURL, "urlset"); |
73 |
document.appendChild(urlsetElement); |
74 |
|
75 |
// // Linking this XML to the AtlasML Schema |
76 |
// final Attr namespaces = document.createAttributeNS( |
77 |
// "http://www.w3.org/2001/XMLSchema-instance", "schemaLocation"); |
78 |
// namespaces |
79 |
// .setValue(NSURL+" http://localhost:" |
80 |
// + Webserver.DEFAULTPORT |
81 |
// + "/skrueger/atlas/resource/AtlasML.xsd"); |
82 |
// urlset.setAttributeNode(namespaces); |
83 |
|
84 |
} catch (Exception e) { |
85 |
throw new RuntimeException( |
86 |
"Sitemap org.w3c.xml newDocumentBuilder failed:", e); |
87 |
} |
88 |
} |
89 |
|
90 |
/** |
91 |
* |
92 |
* @param locString |
93 |
* @param lastmod |
94 |
* when was the page modified the last time? May be |
95 |
* <code>null</code>. |
96 |
* @param changefreq |
97 |
* How often does the page change. May be <code>null</code>. |
98 |
* @param priority |
99 |
* Priority 0.0 - 1.0. May be <code>null</code>. |
100 |
* @return <code>true</code> if the {@link Sitemap} contains less than 50000 |
101 |
* urls and the page was added. |
102 |
*/ |
103 |
public boolean addUrl(String locString, Date lastmod, |
104 |
CHANGEFREQ changefreq, Double priority) { |
105 |
size++; |
106 |
if (size >= 50000) |
107 |
return false; |
108 |
|
109 |
Element urlElement = document.createElement("url"); |
110 |
|
111 |
if (locString == null) |
112 |
throw new IllegalArgumentException("location must be provided"); |
113 |
else { |
114 |
Element e = document.createElement("loc"); |
115 |
e.appendChild(document.createTextNode(locString)); |
116 |
urlElement.appendChild(e); |
117 |
} |
118 |
|
119 |
if (lastmod != null) { |
120 |
Element e = document.createElement("lastmod"); |
121 |
e.appendChild(document.createTextNode(lastmod.toString())); |
122 |
urlElement.appendChild(e); |
123 |
} |
124 |
|
125 |
if (changefreq != null) { |
126 |
Element e = document.createElement("changefreq"); |
127 |
e.appendChild(document.createTextNode(changefreq.toString())); |
128 |
urlElement.appendChild(e); |
129 |
} |
130 |
|
131 |
if (priority != null) { |
132 |
Element e = document.createElement("priority"); |
133 |
e.appendChild(document.createTextNode(priority.toString())); |
134 |
urlElement.appendChild(e); |
135 |
} |
136 |
urlsetElement.appendChild(urlElement); |
137 |
return true; |
138 |
} |
139 |
|
140 |
public Document getDocument() { |
141 |
return document; |
142 |
} |
143 |
|
144 |
public String getXmlString() { |
145 |
StringWriter stringWriter = new StringWriter(); |
146 |
outputToWriter(stringWriter); |
147 |
return stringWriter.toString(); |
148 |
} |
149 |
|
150 |
/** |
151 |
* Can be used to write to a File with {@link FileWriter}. |
152 |
*/ |
153 |
public void outputToWriter(Writer stringWriter) { |
154 |
try { |
155 |
|
156 |
try { // close outputStreamWriter.close(); |
157 |
|
158 |
// **************************************************************************** |
159 |
// Create the XML |
160 |
// **************************************************************************** |
161 |
final Result result = new StreamResult(stringWriter); |
162 |
|
163 |
// with indenting to make it human-readable |
164 |
final TransformerFactory tf = TransformerFactory.newInstance(); |
165 |
|
166 |
// TODO Ging mit xerces, geht nicht mehr mit xalan ?! |
167 |
// tf.setAttribute("indent-number", new Integer(2)); |
168 |
|
169 |
final Transformer xformer = tf.newTransformer(); |
170 |
xformer.setOutputProperty(OutputKeys.INDENT, "yes"); |
171 |
xformer.setOutputProperty( |
172 |
"{http://xml.apache.org/xalan}indent-amount", "2"); |
173 |
|
174 |
// Write the DOM document to the file |
175 |
xformer.transform(new DOMSource(document), result); |
176 |
|
177 |
} finally { |
178 |
stringWriter.close(); |
179 |
} |
180 |
|
181 |
} catch (Exception e) { |
182 |
log.error("Failed to create sitemap.XML-String", e); |
183 |
throw new RuntimeException(e); |
184 |
} |
185 |
} |
186 |
|
187 |
public int getSize() { |
188 |
return size; |
189 |
} |
190 |
|
191 |
public static void submitToSearchEngines(String urlToSitemap) { |
192 |
try { |
193 |
|
194 |
urlToSitemap = URLEncoder.encode(urlToSitemap, "UTF-8"); |
195 |
for (String se : SEARCHENGINES) { |
196 |
log.info("Submitting " + urlToSitemap + " to " + se); |
197 |
try { |
198 |
URL url = new URL(se + urlToSitemap); |
199 |
|
200 |
URLConnection openConnection = url.openConnection(); |
201 |
if (openConnection instanceof HttpURLConnection) { |
202 |
HttpURLConnection httpconnection = ((HttpURLConnection) openConnection); |
203 |
if (httpconnection.getResponseCode() != 200) { |
204 |
log.error(se + " returned ResponseCode " |
205 |
+ httpconnection.getResponseCode() |
206 |
+ " for " + urlToSitemap); |
207 |
} |
208 |
} |
209 |
|
210 |
} catch (Exception e) { |
211 |
log.error("failed to submit " + urlToSitemap + " to " + se, |
212 |
e); |
213 |
} |
214 |
} |
215 |
} catch (Exception e) { |
216 |
log.error("Error URL encoding " + urlToSitemap, e); |
217 |
} |
218 |
} |
219 |
} |