First draft: a very simple version. If all links pointed to external pages, this would work fine.
However, since there are also links within the wiki, it really should handle intra-wiki links and make sure they are properly formatted.
/**
 * Matches an HTML anchor element that carries a double-quoted {@code href} attribute.
 * Group 1 is the href value, group 2 is everything between the opening tag and the
 * next {@code </a>}. Compiled once because the pattern never changes.
 */
private static final Pattern RX_FIND_LINKS = Pattern.compile(
        "<a\\b[^>]*?\\shref=\"([^\"]+)\"[^>]*>(.*?)</a>",
        Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

/** Matches any run of whitespace, including line breaks. */
private static final Pattern RX_WHITESPACE_RUN = Pattern.compile("\\s+");

/**
 * Builds a wikitext bullet list from the hyperlinks found in a page of HTML.
 *
 * <p>Every {@code <a ... href="...">text</a>} element becomes one line of the form
 * {@code * [[href|text]]} terminated by {@code \r\n}. Each generated line is also
 * echoed to standard output.
 *
 * <p>Drawback of this routine: all text on the page that is not inside a link is lost.
 * The href is copied verbatim; intra-wiki links are not rewritten.
 * NOTE(review): MediaWiki normally writes external links as {@code [url text]}; the
 * {@code [[href|text]]} form is kept here so existing callers see the same output
 * format. Confirm which form is wanted.
 *
 * @param sPageHTML the HTML to scan; {@code null} is treated as an empty page
 * @return the generated list, one link per line, or an empty string if no links were found
 */
private static String getMediaWikitextFromLinksInHTML(String sPageHTML) {
    if (sPageHTML == null || sPageHTML.isEmpty()) {
        return "";
    }

    StringBuilder sbMediaWikitextListOfLinks = new StringBuilder();
    Matcher m = RX_FIND_LINKS.matcher(sPageHTML);
    while (m.find()) {
        // Collapse line breaks and repeated blanks so each list item stays on one line.
        String sLinkText = RX_WHITESPACE_RUN.matcher(m.group(2)).replaceAll(" ").trim();
        if (sLinkText.isEmpty()) {
            // An anchor without visible text cannot be labelled; skip it.
            continue;
        }
        String sThisLink = "* [[" + m.group(1) + "|" + sLinkText + "]]";
        System.out.println(sThisLink);
        sbMediaWikitextListOfLinks.append(sThisLink).append("\r\n");
    }
    return sbMediaWikitextListOfLinks.toString();
}
Labels:
None
Add Comment