package org.apache.lucene.analysis.standard;
/*
* Copyright 2001-2005 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URL;
import java.net.URLConnection;
import java.text.DateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.SortedSet;
import java.util.TimeZone;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Generates a file containing JFlex macros to accept valid ASCII TLDs
 * (top level domains), for inclusion in JFlex grammars that can accept
 * domain names.
 * <p>
 * The IANA Root Zone Database is fetched from the URL given as cmdline arg #0,
 * the response is parsed, and the results are written out to a file containing
 * a JFlex macro that will accept all valid ASCII-only TLDs, including punycode
 * forms of internationalized TLDs (output file cmdline arg #1).
 */
public class GenerateJflexTLDMacros {

  /**
   * Command line entry point.
   * <p>
   * Expects exactly two arguments: the zone file URL and the output file
   * path.  With any other argument count, or when the first argument is
   * <code>--help</code> or <code>-help</code>, a usage message is printed to
   * stderr and the JVM exits with status 1.
   *
   * @param args the zone file URL followed by the JFlex output file path
   * @throws Exception if the URL is malformed, or downloading, parsing or
   *  writing fails
   */
  public static void main(String... args) throws Exception {
    if (args.length != 2 || args[0].equals("--help") || args[0].equals("-help")) {
      System.err.println("Cmd line params:");
      System.err.println(usage());
      System.exit(1);
    }
    new GenerateJflexTLDMacros(args[0], args[1]).execute();
  }

  /**
   * Builds the one-line usage synopsis printed by {@link #main(String...)}.
   *
   * @return the usage line: the java command, this class's name and the two
   *  expected arguments, separated by single spaces
   */
  private static String usage() {
    // The leading space before "<ZoneFileURL>" keeps the first argument
    // placeholder from being glued onto the class name.
    return "\tjava " + GenerateJflexTLDMacros.class.getName()
         + " <ZoneFileURL> <JFlexOutputFile>";
  }

  /** Platform line separator used for every line written to the output file. */
  private static final String NL = System.getProperty("line.separator");

  /** License header written at the top of the generated file. */
  private static final String APACHE_LICENSE
    = "/*" + NL
    + " * Copyright 2001-2005 The Apache Software Foundation." + NL
    + " *" + NL
    + " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
    + " * you may not use this file except in compliance with the License." + NL
    + " * You may obtain a copy of the License at" + NL
    + " *" + NL
    + " * http://www.apache.org/licenses/LICENSE-2.0" + NL
    + " *" + NL
    + " * Unless required by applicable law or agreed to in writing, software" + NL
    + " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
    + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
    + " * See the License for the specific language governing permissions and" + NL
    + " * limitations under the License." + NL
    + " */" + NL + NL;

  /** Matches a whole line of the form <code>label. NS anything</code>; group 1 is the label. */
  private static final Pattern TLD_PATTERN_1
    = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");

  /** Matches a whole line of the form <code>label. digits IN NS anything</code>; group 1 is the label. */
  private static final Pattern TLD_PATTERN_2
    = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");

  /** Location the zone data is read from. */
  private final URL tldFileURL;

  /** Last-modified time reported by the connection; stays -1 until a download has happened. */
  private long tldFileLastModified = -1L;

  /** File the generated JFlex macro is written to. */
  private final File outputFile;

  /**
   * Creates a generator for the given source URL and destination file.
   * Nothing is downloaded or written until {@link #execute()} is called.
   *
   * @param tldFileURL URL of the zone file to read
   * @param outputFile path of the JFlex macro file to write
   * @throws Exception if <code>tldFileURL</code> cannot be parsed as a URL
   */
  public GenerateJflexTLDMacros(String tldFileURL, String outputFile)
    throws Exception {
    this.tldFileURL = new URL(tldFileURL);
    this.outputFile = new File(outputFile);
  }

  /**
   * Downloads the IANA Root Zone Database, extracts the ASCII TLDs, then
   * writes a JFlex macro accepting any of them case-insensitively out to
   * the specified output file.
   *
   * @throws IOException if there is a problem either downloading the database
   *  or writing out the output file, or if the downloaded data contains no
   *  TLDs at all (in which case the output file is left untouched).
   */
  public void execute() throws IOException {
    final SortedSet<String> TLDs = getIANARootZoneDatabase();
    if (TLDs.isEmpty()) {
      // An empty set would be written as an empty alternation "( )" in the
      // macro.  Fail before the output file is opened so that a previously
      // generated file is not overwritten with an unusable one.
      throw new IOException("No top level domains found at <" + tldFileURL
                            + ">; not writing '" + outputFile + "'.");
    }
    writeOutput(TLDs);
    System.err.println("Wrote " + TLDs.size() + " top level domains to '"
                       + outputFile + "'.");
  }

  /**
   * Downloads the IANA Root Zone Database.
   * <p>
   * Also records the connection's last-modified time for use in the
   * generated file's header.
   *
   * @return downcased sorted set of ASCII TLDs
   * @throws java.io.IOException if there is a problem downloading the database
   */
  private SortedSet<String> getIANARootZoneDatabase() throws IOException {
    final SortedSet<String> TLDs = new TreeSet<String>();
    final URLConnection connection = tldFileURL.openConnection();
    // Ask for a fresh copy rather than a cached one.
    connection.setUseCaches(false);
    connection.addRequestProperty("Cache-Control", "no-cache");
    connection.connect();
    tldFileLastModified = connection.getLastModified();
    BufferedReader reader = new BufferedReader
      (new InputStreamReader(connection.getInputStream(), "US-ASCII"));
    try {
      String line;
      while (null != (line = reader.readLine())) {
        // Try the short record form first, then the form with TTL and class.
        Matcher matcher = TLD_PATTERN_1.matcher(line);
        if ( ! matcher.matches()) {
          matcher = TLD_PATTERN_2.matcher(line);
          if ( ! matcher.matches()) {
            continue; // not an NS record line in either recognized form
          }
        }
        TLDs.add(matcher.group(1).toLowerCase(Locale.US));
      }
    } finally {
      reader.close();
    }
    return TLDs;
  }

  /**
   * Writes a file containing a JFlex macro that will accept any of the given
   * TLDs case-insensitively.
   * <p>
   * The file starts with the license header and comment lines naming the
   * source URL, the source's last-modified time (only when known), the
   * generation time and this class; dates are formatted in UTC with the US
   * locale.
   *
   * @param ASCIITLDs The downcased sorted set of top level domains to accept
   * @throws IOException if there is an error writing the output file
   */
  private void writeOutput(SortedSet<String> ASCIITLDs) throws IOException {
    final DateFormat dateFormat = DateFormat.getDateTimeInstance
      (DateFormat.FULL, DateFormat.FULL, Locale.US);
    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
    final Writer writer = new OutputStreamWriter
      (new FileOutputStream(outputFile), "UTF-8");
    try {
      writer.write(APACHE_LICENSE);
      writer.write("// Generated from IANA Root Zone Database <");
      writer.write(tldFileURL.toString());
      writer.write(">");
      writer.write(NL);
      if (tldFileLastModified > 0L) {
        writer.write("// file version from ");
        writer.write(dateFormat.format(new Date(tldFileLastModified)));
        writer.write(NL);
      }
      writer.write("// generated on ");
      writer.write(dateFormat.format(new Date()));
      writer.write(NL);
      writer.write("// by ");
      writer.write(this.getClass().getName());
      writer.write(NL);
      writer.write(NL);
      writer.write("ASCIITLD = \".\" (");
      writer.write(NL);
      boolean isFirst = true;
      for (String ASCIITLD : ASCIITLDs) {
        writer.write("\t");
        // The first alternative is padded with two spaces so that it lines
        // up with the "| "-prefixed alternatives that follow.
        if (isFirst) {
          isFirst = false;
          writer.write("  ");
        } else {
          writer.write("| ");
        }
        writer.write(getCaseInsensitiveRegex(ASCIITLD));
        writer.write(NL);
      }
      writer.write("\t) \".\"?   // Accept trailing root (empty) domain");
      writer.write(NL);
      writer.write(NL);
    } finally {
      writer.close();
    }
  }

  /**
   * Returns a regex that will accept the given ASCII TLD case-insensitively.
   * <p>
   * Digits and hyphens are copied through unchanged; every other character
   * <code>c</code> becomes the two-member character class
   * <code>[cC]</code>, e.g. <code>xn--p1ai</code> yields
   * <code>[xX][nN]--[pP]1[aA][iI]</code>.
   *
   * @param ASCIITLD The ASCII TLD to generate a regex for
   * @return a regex that will accept the given ASCII TLD case-insensitively
   */
  private String getCaseInsensitiveRegex(String ASCIITLD) {
    StringBuilder builder = new StringBuilder();
    for (int pos = 0 ; pos < ASCIITLD.length() ; ++pos) {
      char ch = ASCIITLD.charAt(pos);
      if (Character.isDigit(ch) || ch == '-') {
        builder.append(ch);
      } else {
        builder.append("[").append(ch).append(Character.toUpperCase(ch)).append("]");
      }
    }
    return builder.toString();
  }
}