rssread revision da2e3ebdc1edfbc5028edf1354e7dd2fa69a7968
#!/bin/ksh93
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
#
#
# rssread - a simple RSS2.0 reader with RSS to XHTML to
# plaintext conversion.
#
# Solaris needs /usr/xpg4/bin/ ahead of /usr/bin in PATH because the tools
# in /usr/bin are not POSIX-conformant
export PATH=/usr/xpg4/bin:/bin:/usr/bin
# printmsg - write all arguments as one line to stderr
function printmsg
{
    # same as "print -u 2": the message goes to file descriptor 2
    print "$@" >&2
}
# debugmsg - debug trace hook; currently disabled, so it prints nothing
# and always succeeds
function debugmsg
{
    # to turn tracing on, forward the arguments to printmsg:
    #     printmsg "$@"
    return 0
}
# fatal_error - print "<progname>: <message>" to stderr and exit with status 1
#
# Arguments:
#   $@ - message words; they are joined with single blanks
# Globals:
#   progname (read)
#
# Note: when called inside a subshell only that subshell is terminated.
function fatal_error
{
    # printf with "%s" emits the text verbatim; "print" would interpret
    # backslash sequences and leading dashes in the message
    printf '%s: %s\n' "${progname}" "$*" >&2
    exit 1
}
# cat_http - fetch a document via a plain HTTP/1.0 GET request
#
# Arguments:
#   $1 - URL of the form "protocol://host[:port]/path"
# Output:
#   stdout - everything the server sends back, unfiltered
#   stderr - one line listing the parsed URL components
#
# The whole body runs in a subshell, so file descriptor 3 and all working
# variables vanish on return, and a failed precheck (fatal_error) ends
# only that subshell.
function cat_http
{
(
    typeset protocol path1 host hostport path port

    # "%%" strips from the FIRST "://" so that URLs which carry another
    # URL in their query string are still split at the right place
    protocol="${1%%://*}"
    path1="${1#*://}" # "http://foo.bat.net/x/y.html" ----> "foo.bat.net/x/y.html"
    host="${path1%%/*}"

    # without a "/" after the host there is no path at all; "${path1#*/}"
    # would leave the host name in ${path} in that case
    if [[ "${path1}" == */* ]] ; then
        path="${path1#*/}"
    else
        path=""
    fi

    hostport="${host}" # host as written in the URL (incl. ":port") for the Host header
    port="${host##*:}"

    # If URL did not contain a port number in the host part then look at the
    # protocol to get the port number
    if [[ "${port}" == "${host}" ]] ; then
        case "${protocol}" in
            "http") port=80 ;;
            *)      port="$(getent services "${protocol}" | sed 's/[^0-9]*//;s/\/.*//')" ;;
        esac
    else
        host="${host%:*}"
    fi

    printmsg "protocol=${protocol} port=${port} host=${host} path=${path}"

    # prechecks
    [[ -z "${protocol}" ]] && fatal_error "protocol not set."
    [[ -z "${port}" ]]     && fatal_error "port not set."
    [[ -z "${host}" ]]     && fatal_error "host not set."
    [[ -z "${path}" ]]     && fatal_error "path not set."

    # open TCP channel (read/write) on fd 3
    exec 3<>"/dev/tcp/${host}/${port}"

    # send HTTP request: header lines end in CRLF and an empty line closes
    # the header block; "%s" keeps the URL parts free of escape processing
    printf 'GET /%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\n\r\n' \
        "${path}" \
        "${hostport}" \
        "ksh93/rssread (2007-01-16; $(uname -s -r -p))" >&3

    # collect response and send it to stdout
    cat <&3
)
}
# html_entity_to_ascii - filter which decodes HTML/XML entities
#
# Reads stdin one character at a time and writes to stdout:
#   - "&name;" for the names listed in entity_cache is replaced by its text
#   - "&#NNN;" (decimal) and "&#xHHH;" (hexadecimal) are replaced by the
#     character with that code (at most 8 digits are used)
#   - any other terminated entity is passed on as "<ENT=name>"
#   - a "&" which does not start an entity is copied through unchanged
# All other input is copied through unchanged.
function html_entity_to_ascii
{
    typeset -A entity_cache
    typeset c=""          # character which is currently processed
    typeset entity=""     # entity name collected so far (without "&" and ";")
    typeset value=""      # replacement text for ${entity}
    typeset code=""       # digits of a numeric entity
    typeset terminated    # bool: true once the closing ";" was seen

    # Todo: Add more HTML/MathML entities here
    entity_cache["nbsp"]=" "
    entity_cache["lt"]="<"
    entity_cache["gt"]=">"
    entity_cache["amp"]="&"
    entity_cache["quot"]="\""
    entity_cache["apos"]="'"

    while IFS='' read -r -N 1 c ; do
        if [[ "${c}" != "&" ]] ; then
            printf "%s" "${c}"
            continue
        fi

        # collect the entity name up to the terminating ";"
        entity=""
        terminated=false
        while IFS='' read -r -N 1 c ; do
            case "${c}" in
                ";")
                    terminated=true
                    break
                    ;;
                [a-zA-Z0-9#])
                    entity+="${c}"
                    ;;
                *)
                    # not an entity after all - hand the text through as it
                    # was read, including the leading "&"
                    debugmsg "error &${entity}${c}#"
                    printf "%s" "&${entity}${c}"
                    continue 2
                    ;;
            esac
        done

        # input ended in the middle of an entity - emit what was read
        if ! ${terminated} ; then
            printf "%s" "&${entity}"
            break
        fi

        # "&;" has no name which could be looked up
        if [[ -z "${entity}" ]] ; then
            printf "%s" "&;"
            continue
        fi

        value=""
        if [[ -n "${entity_cache["${entity}"]}" ]] ; then
            debugmsg "match #${entity}# = #${entity_cache["${entity}"]}#"
            value="${entity_cache["${entity}"]}"
        else
            case "${entity}" in
                "#"[xX]*)
                    # hexadecimal literal ("&#x20ac;")
                    code="${entity:2:8}"
                    if [[ -n "${code}" && "${code}" != *[!0-9a-fA-F]* ]] ; then
                        # the trailing "x" protects a decoded newline from
                        # being stripped by the command substitution
                        value="$(printf "\u[${code}]" ; printf "x")"
                        value="${value%x}"
                    else
                        value="<ENT=${entity}>"
                    fi
                    ;;
                "#"*)
                    # decimal literal ("&#8364;"); "10#" forces base 10 so
                    # leading zeros are not taken as an octal prefix
                    code="${entity:1:8}"
                    if [[ -n "${code}" && "${code}" != *[!0-9]* ]] ; then
                        code="$(printf "%x" "$(( 10#${code} ))")"
                        value="$(printf "\u[${code}]" ; printf "x")"
                        value="${value%x}"
                    else
                        value="<ENT=${entity}>"
                    fi
                    ;;
                *)
                    # unknown literal - pass-through
                    value="<ENT=${entity}>"
                    ;;
            esac

            # remember the result for the next occurrence
            entity_cache["${entity}"]="${value}"
            debugmsg "lookup #${entity}# = #${entity_cache["${entity}"]}#"
        fi

        printf "%s" "${value}"
    done
}
# handle_html - xml_tok callback which renders (x)html as plain text
#
# Dumb xhtml handler - no CSS, tables, images, iframes or nested
# structures are supported (and we assume that the input is correct
# xhtml). The code was written in a trial-and-error manner and should be
# rewritten to parse xhtml correctly.
#
# Arguments:
#   $1 - name of the callback associative array; its "html_pre" slot
#        stores whether we are inside <pre>...</pre> (1) or not (0)
#   $2 - event type: tag_begin, tag_end, tag_text, document_start or
#        document_end
#   $3 - tag name (with attributes) or text, depending on $2
# Output:
#   stdout - the plain text rendering
function handle_html
{
    # we can't use global variables here when multiple callbacks use the same
    # callback function - but we can use the callback associative array for
    # variable storage instead
    typeset -n callbacks="${1}"
    typeset tag_type="${2}"
    typeset tag_value="${3}"

    case "${tag_type}" in
        tag_begin)
            # patterns are prefix matches ("br*" also covers "br/" and
            # "br /"); "pre*" must stay ahead of "p*", which matches every
            # remaining tag starting with "p"
            case "${tag_value}" in
                br*)  printf "\n" ;;
                hr*)  printf "\n-------------------------------------\n" ;;
                pre*) callbacks["html_pre"]=1 ;;
                p*)   printf "\n" ;;
            esac
            ;;
        tag_end)
            case "${tag_value}" in
                pre*) callbacks["html_pre"]=0 ;;
            esac
            ;;
        tag_text)
            if [[ "${callbacks["html_pre"]}" == 1 ]] ; then
                # preformatted text is passed through unchanged
                printf "%s" "${tag_value}"
            else
                # compress spaces/newlines/tabs/etc.: EVERY run of
                # whitespace becomes a single blank
                printf "%s" "${tag_value//+([[:space:]])/ }"
            fi
            ;;
        document_start)
            callbacks["html_pre"]=0
            ;;
        document_end) ;;
    esac
}
# handle_rss - xml_tok callback which collects RSS items and prints them
#
# Text between tags is gathered in the "textbuf" slot of the callback
# array. When a title/link/author/date/description element ends, the
# gathered text is stored in the global "item" array; when an <item>
# ends, the item is formatted as HTML and piped through the entity
# decoder and the HTML-to-text tokenizer.
#
# Arguments:
#   $1 - name of the callback associative array (uses slot "textbuf")
#   $2 - event type: tag_begin, tag_end, tag_text, document_start or
#        document_end
#   $3 - tag name (with attributes) or text, depending on $2
# Globals:
#   item (written) - fields of the RSS item which is currently collected
# Output:
#   stdout - plain text rendering of each completed item
function handle_rss
{
    # we can't use global variables here when multiple callbacks use the same
    # callback function - but we can use the callback associative array for
    # variable storage instead
    typeset -n callbacks="${1}"
    typeset tag_type="${2}"
    typeset tag_value="${3}"

    case "${tag_type}" in
        tag_begin)
            case "${tag_value}" in
                item*)
                    # start every item with a clean slate; author and
                    # pubDate are optional elements and would otherwise
                    # keep the value of an earlier item
                    item["title"]=""
                    item["link"]=""
                    item["tag"]=""
                    item["author"]=""
                    item["pubDate"]=""
                    item["description"]=""
                    ;;
            esac
            callbacks["textbuf"]=""
            ;;
        tag_end)
            case "${tag_value}" in
                item*)
                    # note that each RSS item needs to be converted separately from RSS to HTML to plain text
                    # to make sure that the state of one RSS item doesn't affect others
                    (
                        printf $"<br />#### RSS item: title: %s ####" "${item["title"]}"
                        printf $"<br />## author: %s" "${item["author"]}"
                        printf $"<br />## link: %s" "${item["link"]}"
                        printf $"<br />## date: %s" "${item["pubDate"]}"
                        printf $"<br />## begin description:"
                        printf $"<br />%s<br />" "${item["description"]}"
                        printf $"<br />## end description<br />"
                        print # extra newline to make sure the sed pipeline gets flushed
                    ) |
                        html_entity_to_ascii |  # convert XML entities (e.g. decode RSS content to HTML code)
                        xml_tok "xhtmltok_cb" | # convert HTML to plain text
                        html_entity_to_ascii    # convert HTML entities
                    ;;
                title*)                 item["title"]="${callbacks["textbuf"]}" ;;
                link*)                  item["link"]="${callbacks["textbuf"]}" ;;
                dc:creator* | author*)  item["author"]="${callbacks["textbuf"]}" ;;
                dc:date* | pubDate*)    item["pubDate"]="${callbacks["textbuf"]}" ;;
                description*)           item["description"]="${callbacks["textbuf"]}" ;;
            esac
            # the text of the element which just ended has been consumed
            callbacks["textbuf"]=""
            ;;
        tag_text)
            callbacks["textbuf"]+="${tag_value}"
            ;;
        document_start) ;;
        document_end) ;;
    esac
}
# xml_tok - minimal XML/HTML tokenizer driven by callbacks
#
# Reads stdin character by character and reports events through the
# functions named in the callback associative array:
#   document_start  <array name> "document_start"
#   tag_begin       <array name> "tag_begin" "<tag name and attributes>"
#   tag_end         <array name> "tag_end"   "<tag name>"
#   tag_text        <array name> "tag_text"  "<text between two tags>"
#   document_end    <array name> "document_end" "exit_success"
# Slots which are unset or empty are skipped. Tags which end in "/"
# (like <br/>) produce tag_begin followed by tag_end. Text after the last
# tag is not reported. A final newline is written to stdout at the end.
#
# Arguments:
#   $1 - name of the callback associative array
function xml_tok
{
    typeset buf=""
    typeset c=""
    typeset isendtag # bool: true while an end tag ("</...>") is processed
    typeset -n callbacks="${1}"

    [[ -n "${callbacks["document_start"]}" ]] && ${callbacks["document_start"]} "${1}" "document_start"

    # -N 1 reads exactly one character regardless of line boundaries; -r and
    # the empty IFS keep backslashes and whitespace intact
    while IFS='' read -r -N 1 c ; do
        isendtag=false

        if [[ "${c}" == "<" ]] ; then
            # flush the text collected in front of this tag
            if [[ -n "${buf}" ]] ; then
                [[ -n "${callbacks["tag_text"]}" ]] && ${callbacks["tag_text"]} "${1}" "tag_text" "${buf}"
                buf=""
            fi

            # the first character tells start tags and end tags apart
            IFS='' read -r -N 1 c
            if [[ "${c}" == "/" ]] ; then
                isendtag=true
            else
                buf="${c}"
            fi

            # the rest of the tag runs up to the closing ">"
            IFS='' read -r -d '>' c
            buf+="${c}"

            if ${isendtag} ; then
                [[ -n "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "${buf}"
            else
                [[ -n "${callbacks["tag_begin"]}" ]] && ${callbacks["tag_begin"]} "${1}" "tag_begin" "${buf}"

                # handle tags like <br/> (which are start- and end-tag in one piece)
                if [[ "${buf}" == */ ]] ; then
                    [[ -n "${callbacks["tag_end"]}" ]] && ${callbacks["tag_end"]} "${1}" "tag_end" "${buf}"
                fi
            fi
            buf=""
        else
            buf+="${c}"
        fi
    done

    # the end of the document is reported to the document_end callback
    [[ -n "${callbacks["document_end"]}" ]] && ${callbacks["document_end"]} "${1}" "document_end" "exit_success"

    printf "\n" # final newline to make filters like "sed" happy
}
# get_lc_messages - print the locale name which is in effect for messages
#
# Used to hand LC_MESSAGES on to subprocesses that run in a different
# locale/encoding. The first non-empty value of LC_ALL, LC_MESSAGES and
# LANG wins; "C" is printed when all three are empty. Always returns 0.
function get_lc_messages
{
    typeset candidate

    for candidate in "${LC_ALL}" "${LC_MESSAGES}" "${LANG}" "C" ; do
        if [[ -n "${candidate}" ]] ; then
            print "${candidate}"
            return 0
        fi
    done
}
# usage - show the usage message and terminate the script with status 2
# Globals: progname (read), USAGE (read), OPTIND (reset to 0), OPT (written)
function usage
{
# run getopts once more with OPTIND reset and an explicit '-?' argument
# instead of the script's own arguments; presumably this makes the ksh93
# getopts print the usage text generated from ${USAGE} - confirm against
# the ksh93 getopts documentation
OPTIND=0
getopts -a "${progname}" "${USAGE}" OPT '-?'
exit 2
}
# make sure we use the ksh93 builtin versions
builtin cat
builtin printf
# Callback table for the RSS pass: begin/end/text events are all routed
# to handle_rss; "textbuf" is the slot in which handle_rss accumulates the
# text of the current element.
typeset -A rsstok_cb # callbacks for xml_tok
rsstok_cb["tag_begin"]="handle_rss"
rsstok_cb["tag_end"]="handle_rss"
rsstok_cb["tag_text"]="handle_rss"
rsstok_cb["textbuf"]=""
# Callback table for the HTML-to-text pass: events are routed to
# handle_html; "html_pre" is its "inside <pre>" flag (0 = off).
typeset -A xhtmltok_cb # callbacks for xml_tok
xhtmltok_cb["tag_begin"]="handle_html"
xhtmltok_cb["tag_end"]="handle_html"
xhtmltok_cb["tag_text"]="handle_html"
xhtmltok_cb["textbuf"]=""
xhtmltok_cb["html_pre"]=0
# fields of the RSS item which is currently being collected by handle_rss
typeset -A item
# short names which may be given on the command line instead of a full URL
typeset -A bookmark_urls
# "random" urls for testing
bookmark_urls=(
["google_blogs_ksh"]="http://blogsearch.google.com/blogsearch_feeds?hl=en&scoring=d&q=ksh&ie=utf-8&num=100&output=rss"
["ksh93_integration"]="http://www.opensolaris.org/rss/os/project/ksh93-integration/announcements/rss2.xml"
["blogs_sun_com"]="http://blogs.sun.com/main/feed/entries/rss"
["jmcp"]="http://www.jmcp.homeunix.com/roller/rss/jmcp"
["katakai"]="http://blogs.sun.com/katakai/feed/entries/rss"
["planetsun"]="http://www.planetsun.org/rss20.xml"
["planetsolaris"]="http://www.planetsolaris.org/rss20.xml"
["planetopensolaris"]="http://planet.opensolaris.org/rss20.xml"
)
# name used as prefix in error messages and passed to getopts
progname="${0}"
# option/usage description handed to getopts below and in usage()
USAGE=$'
[-?
@(#)\$Id: rssread (Roland Mainz) 2007-06-05 \$
]
[+NAME?rssread - fetch RSS messages and convert them to plain text]
[+DESCRIPTION?\brssread\b RSS to plain text converter
which fetches RSS streams via HTTP and converts them from RSS to HTML to plain UTF-8 text.]
[ url ]
[+SEE ALSO?\bksh93\b(1)]
'
# the script defines no options of its own, so every option which reaches
# the case statement ends in usage()
while getopts -a "${progname}" "${USAGE}" OPT ; do
# printmsg "## OPT=|${OPT}|, OPTARG=|${OPTARG}|"
case ${OPT} in
*) usage ;;
esac
done
# drop the arguments consumed by getopts; the first one left is the url
shift ${OPTIND}-1
url="$1"
if [ "$url" = "" ] ; then
fatal_error $"No url given."
fi
# a bookmark name is replaced by the URL stored for it
if [ "${bookmark_urls[${url}]}" != "" ] ; then
printmsg $"Using bookmark ${url} = ${bookmark_urls[${url}]}"
url="${bookmark_urls[${url}]}"
fi
# Fetch the feed and feed it to the RSS tokenizer. This runs in a subshell,
# so the exported locale settings below do not affect the calling shell.
(
# set unicode locale since RSS is encoded in UTF-8
# (and make sure $LC_MESSAGES is set to the parent
# process's locale so that all error messages use
# the caller's locale/encoding)
export \
LC_MESSAGES="$(get_lc_messages)" \
LC_MONETARY="en_US.UTF-8" \
LC_NUMERIC="en_US.UTF-8" \
LC_COLLATE="en_US.UTF-8" \
LC_CTYPE="en_US.UTF-8" \
LC_TIME="en_US.UTF-8" \
LANG="en_US.UTF-8"
# cat_http writes the raw server response to stdout; xml_tok dispatches
# the tokens to the handlers registered in rsstok_cb
cat_http "$url" |
xml_tok "rsstok_cb"
) # | iconv -f "UTF-8" - -
#EOF.